示例#1
0
 def __init__(self):
     """Assemble the copy/blit routines and load them into one runtime.

     The runtime is stored as ``self.r``; the value returned by each
     ``load`` call is stored as ``self.ds``, ``self.ds2`` and ``self.ds3``
     for the "memcpy", "bltrgba" and "bltfloatrgba" modules respectively.
     """
     assembler = Tdasm()
     memcpy_mc = assembler.assemble(MEMCPY)
     self.r = Runtime()
     self.ds = self.r.load("memcpy", memcpy_mc)
     self.ds2 = self.r.load("bltrgba", assembler.assemble(BLTRGBA))
     self.ds3 = self.r.load("bltfloatrgba", assembler.assemble(BLTFLOATRGBA))
示例#2
0
文件: shader.py 项目: mario007/renmas
    def prepare(self, runtimes):
        """Load this shader and everything it depends on into ``runtimes``.

        Steps, in order: load the colour functions, invoke the optional
        ``loader`` callback, prepare every nested shader, load each
        registered assembly function, then load this shader's machine code
        into every runtime where ``self._name`` is not yet a known global.
        Assembled machine code is memoised in ``self._mc_cache`` under
        ``self._name``.  ``self._ds`` is only replaced when at least one
        runtime was actually loaded.
        """
        self._load_color_funcs(runtimes)

        if self.loader:
            self.loader(runtimes)

        for sub_shader in self._shaders:
            sub_shader.prepare(runtimes)

        self._runtimes = runtimes
        assembler = Tdasm()
        load_name = 'shader' + str(id(self))

        for fun_name, fun_label, avx, bit in self._functions:
            load_asm_function(fun_name, fun_label, runtimes, avx, bit)

        data_sections = []
        for runtime in runtimes:
            if runtime.global_exists(self._name):
                # Already present in this runtime; nothing to load.
                continue
            if self._name in self._mc_cache:
                mc = self._mc_cache[self._name]
            else:
                mc = assembler.assemble(self._code, self._func)
                self._mc_cache[self._name] = mc
            data_sections.append(runtime.load(load_name, mc))

        if data_sections:
            self._ds = data_sections
示例#3
0
    def prepare(self, runtimes):
        """Make this shader callable in each of the given runtimes.

        Colour functions, the optional ``loader`` hook, nested shaders and
        registered assembly functions are loaded first.  The shader's own
        machine code is then loaded (under a name derived from ``id(self)``)
        into every runtime that does not already expose ``self._name`` as a
        global, reusing ``self._mc_cache`` when the code was assembled
        before.  ``self._ds`` keeps its old value if no runtime needed a load.
        """
        self._load_color_funcs(runtimes)

        if self.loader:
            self.loader(runtimes)

        for nested in self._shaders:
            nested.prepare(runtimes)

        self._runtimes = runtimes
        assembler = Tdasm()
        module_name = 'shader' + str(id(self))

        for entry in self._functions:
            fun_name, fun_label, avx, bit = entry
            load_asm_function(fun_name, fun_label, runtimes, avx, bit)

        loaded = []
        for rt in runtimes:
            if rt.global_exists(self._name):
                continue
            if self._name not in self._mc_cache:
                mc = assembler.assemble(self._code, self._func)
                self._mc_cache[self._name] = mc
            else:
                mc = self._mc_cache[self._name]
            loaded.append(rt.load(module_name, mc))

        if loaded:
            self._ds = loaded
示例#4
0
    def test_sincos_ps(self):
        """Check packed ``fast_sincos_ps`` against ``math.sin``/``math.cos``.

        Runs 1000 rounds.  Each round writes four random values in
        [0, 2000) to ``v1``, runs the routine, then reads the sines back from
        ``v1`` and the cosines from ``v2`` and compares every lane to three
        decimal places.
        """
        machine_code = Tdasm().assemble(SINCOS_CODE_PS)
        runtime = Runtime()
        load_math_func("fast_sincos_ps", runtime)
        data = runtime.load("sincos_ps", machine_code)

        for _ in range(1000):
            inputs = tuple(random.random() * 2000 for _lane in range(4))
            data["v1"] = inputs
            runtime.run("sincos_ps")
            asm_sin = data["v1"]
            asm_cos = data["v2"]
            expected_sin = [math.sin(value) for value in inputs]
            expected_cos = [math.cos(value) for value in inputs]

            for lane in range(4):
                self.assertAlmostEqual(asm_sin[lane], expected_sin[lane], 3)
            for lane in range(4):
                self.assertAlmostEqual(asm_cos[lane], expected_cos[lane], 3)
示例#5
0
    def test_sincos_ps(self):
        """Verify the four-lane sine/cosine routine on random inputs.

        For each of 1000 iterations, four random numbers in [0, 2000) are
        stored in ``v1``; after running "sincos_ps" the sines are read from
        ``v1`` and the cosines from ``v2`` and each lane is compared with the
        ``math`` module result to three decimal places.
        """
        assembler = Tdasm()
        code = assembler.assemble(SINCOS_CODE_PS)
        rt = Runtime()
        load_math_func("fast_sincos_ps", rt)
        section = rt.load("sincos_ps", code)

        lanes = range(4)
        for _ in range(1000):
            angles = tuple(random.random() * 2000 for _lane in lanes)
            section["v1"] = angles
            rt.run("sincos_ps")
            sines = section["v1"]
            cosines = section["v2"]
            want_sin = tuple(math.sin(angle) for angle in angles)
            want_cos = tuple(math.cos(angle) for angle in angles)

            for idx in lanes:
                self.assertAlmostEqual(sines[idx], want_sin[idx], 3)
            for idx in lanes:
                self.assertAlmostEqual(cosines[idx], want_cos[idx], 3)
示例#6
0
    def test_pow_ps(self):
        """Check packed ``fast_pow_ps`` against ``math.pow``.

        Runs 1000 rounds.  Each round stores four random bases in ``v1`` and
        four random exponents in ``v2`` (all in [0, 3)), runs the routine,
        and compares each lane of ``v1`` with ``math.pow`` to one decimal
        place.
        """
        machine_code = Tdasm().assemble(POW_CODE_PS)
        runtime = Runtime()
        load_math_func("fast_pow_ps", runtime)
        data = runtime.load("pow_ps", machine_code)

        for _ in range(1000):
            # Bases are drawn before exponents, four at a time.
            bases = tuple(random.random() * 3 for _lane in range(4))
            exponents = tuple(random.random() * 3 for _lane in range(4))
            data["v1"] = bases
            data["v2"] = exponents
            runtime.run("pow_ps")
            result = data["v1"]
            expected = [math.pow(b, e) for b, e in zip(bases, exponents)]

            for lane in range(4):
                self.assertAlmostEqual(result[lane], expected[lane], 1)
示例#7
0
    def test_pow_ps(self):
        """Verify the four-lane power routine on random inputs.

        Each of the 1000 iterations draws four bases and then four exponents
        in [0, 3), stores them in ``v1`` and ``v2``, runs "pow_ps", and
        checks every lane of ``v1`` against ``math.pow`` to one decimal
        place.
        """
        assembler = Tdasm()
        code = assembler.assemble(POW_CODE_PS)
        rt = Runtime()
        load_math_func("fast_pow_ps", rt)
        section = rt.load("pow_ps", code)

        lanes = range(4)
        for _ in range(1000):
            xs = tuple(random.random() * 3 for _lane in lanes)
            ys = tuple(random.random() * 3 for _lane in lanes)
            section["v1"] = xs
            section["v2"] = ys
            rt.run("pow_ps")
            got = section["v1"]
            want = tuple(math.pow(xs[idx], ys[idx]) for idx in lanes)

            for idx in lanes:
                self.assertAlmostEqual(got[idx], want[idx], 1)
示例#8
0
def create_float_image(runtime):
    """Create a 150x150 float RGBA image and run the "write" routine on it.

    The image first registers its ``set_pixel`` routine in ``runtime``; the
    module-level ``ASM`` source is then assembled, loaded as "write" and
    executed once.  The image is returned.
    """
    image = renmas.gui.ImageFloatRGBA(150, 150)
    image.set_pixel_asm(runtime, "set_pixel")

    machine_code = Tdasm().assemble(ASM)
    runtime.load("write", machine_code)
    runtime.run("write")
    return image
示例#9
0
def create_float_image(runtime):
    """Build a 150x150 ``ImageFloatRGBA`` and execute the "write" module.

    ``set_pixel_asm`` is called before ``ASM`` is assembled and loaded, so
    the "set_pixel" label is registered in ``runtime`` first.
    """
    size = 150
    picture = renmas.gui.ImageFloatRGBA(size, size)

    picture.set_pixel_asm(runtime, "set_pixel")

    assembler = Tdasm()
    runtime.load("write", assembler.assemble(ASM))
    runtime.run("write")
    return picture
示例#10
0
文件: shader.py 项目: mario007/renmas
 def compile(self, shaders=None):
     """Parse, generate and assemble this shader's source.

     Args:
         shaders: Optional list of shaders forwarded to the code generator.
             Defaults to a fresh empty list on every call (a shared mutable
             default would leak state between calls if it were ever
             modified downstream).

     Side effects:
         Sets ``self._asm_code`` (generated assembly), ``self._ret_type``
         and ``self._mc`` (assembled machine code).
     """
     if shaders is None:
         shaders = []
     stms = parse(self._code)
     cgen = CodeGenerator()
     asm_code, ret_type = cgen.generate_code(
         stms,
         args=self._args,
         is_func=self._is_func,
         name=self._name,
         func_args=self._func_args,
         shaders=shaders,
     )
     self._asm_code = asm_code
     self._ret_type = ret_type
     # A separate name for the assembler avoids reusing ``asm`` for both the
     # generated source text and the assembler object.
     assembler = Tdasm()
     self._mc = assembler.assemble(self._asm_code, self._is_func)
示例#11
0
文件: sam.py 项目: mario007/renmas
def regular_sampler():
    """Set up a 2x2 regular sampler with its test routine loaded.

    Returns:
        A ``(sampler, runtime, 'test')`` tuple: the sampler with a 2x2 tile
        assigned, the runtime holding both ``get_sample`` and the assembled
        ``ASM_CODE``, and the name the latter was loaded under.
    """
    runtime = Runtime()
    sampler = renmas2.samplers.RegularSampler(2, 2, pixel=1.0)
    sampler.get_sample_asm([runtime], 'get_sample')

    tile = renmas2.core.Tile(0, 0, 2, 2)
    tile.split(1)
    sampler.set_tile(tile)

    machine_code = Tdasm().assemble(ASM_CODE)
    runtime.load("test", machine_code)
    return sampler, runtime, 'test'
示例#12
0
 def __init__(self, width, height, pitch, address):
     """Load the ``set_pixel`` routine and bind it to a pixel buffer.

     Args:
         width: Stored on the instance and written to the "width" slot.
         height: Stored on the instance and written to the "height" slot.
         pitch: Written to the "pitch" slot of the data section.
         address: Stored as ``self.addr`` and written to the "address" slot.
     """
     self.addr = address
     self.width = width
     self.height = height
     machine_code = Tdasm().assemble(ASM_STR)
     self.r = Runtime()
     self.ds = self.r.load("set_pixel", machine_code)
     # Default colour.  NOTE(review): the original comment called this red,
     # but 0xFF00FF00 reads as opaque green under 0xAARRGGBB -- confirm the
     # pixel layout.
     initial_values = (
         ("color", 0xFF00FF00),
         ("address", address),
         ("width", width),
         ("height", height),
         ("pitch", pitch),
     )
     for key, value in initial_values:
         self.ds[key] = value
示例#13
0
文件: shader.py 项目: mario007/renmas
 def prepare(self, runtimes):
     """Assemble this shader and load it into every runtime.

     Nested shaders are prepared first.  ``self._ds`` is reset to an empty
     list and then receives one ``load`` result per runtime, in order.
     """
     for nested in self._shaders:
         nested.prepare(runtimes)
     self._ds = []
     machine_code = Tdasm().assemble(self._code, self._func)
     module_name = 'shader' + str(id(self))
     self._runtimes = runtimes
     # TODO check if shader already exists in runtime
     # TODO if shader is function load it as function
     self._ds.extend(
         runtime.load(module_name, machine_code) for runtime in runtimes
     )
示例#14
0
文件: shader.py 项目: mario007/renmas
 def prepare(self, runtimes):
     """Prepare nested shaders, then load this shader's code everywhere.

     The machine code is assembled once and loaded into each runtime under a
     name derived from ``id(self)``; the values returned by ``load`` are
     collected in ``self._ds``.
     """
     for child in self._shaders:
         child.prepare(runtimes)
     self._ds = []
     assembler = Tdasm()
     mc = assembler.assemble(self._code, self._func)
     load_name = 'shader' + str(id(self))
     self._runtimes = runtimes
     for rt in runtimes:
         # TODO check if shader already exists in runtime
         # TODO if shader is function load it as function
         section = rt.load(load_name, mc)
         self._ds.append(section)
示例#15
0
 def compile(self, shaders=None):
     """Turn the shader source into machine code.

     Args:
         shaders: Optional list of shaders passed through to
             ``CodeGenerator.generate_code``.  ``None`` (the default) means
             a new empty list, so no list object is shared between calls.

     Side effects:
         Stores the generated assembly in ``self._asm_code``, the return
         type in ``self._ret_type`` and the assembled code in ``self._mc``.
     """
     if shaders is None:
         shaders = []
     stms = parse(self._code)
     cgen = CodeGenerator()
     asm_code, ret_type = cgen.generate_code(stms,
                                             args=self._args,
                                             is_func=self._is_func,
                                             name=self._name,
                                             func_args=self._func_args,
                                             shaders=shaders)
     self._asm_code = asm_code
     self._ret_type = ret_type
     # Distinct name: ``asm`` previously meant both the source text and the
     # assembler instance.
     assembler = Tdasm()
     self._mc = assembler.assemble(self._asm_code, self._is_func)
示例#16
0
    def test_atan(self):
        """Compare ``fast_atan_ss`` with ``math.atan`` on 1000 random inputs.

        Inputs are drawn from [0, 2000); results must agree to three decimal
        places.
        """
        machine_code = Tdasm().assemble(ATAN_CODE)
        runtime = Runtime()
        load_math_func("fast_atan_ss", runtime)
        data = runtime.load("atan", machine_code)

        for _ in range(1000):
            value = random.random() * 2000
            data["x"] = value
            runtime.run("atan")
            # The routine's result is read back from the same "x" slot.
            self.assertAlmostEqual(data["x"], math.atan(value), 3)
示例#17
0
    def test_log(self):
        """Compare ``fast_log_ss`` with ``math.log`` on 1000 random inputs.

        Results must agree to three decimal places.
        """
        asm = Tdasm()
        mc = asm.assemble(LOG_CODE)
        runtime = Runtime()
        load_math_func("fast_log_ss", runtime)
        ds = runtime.load("log", mc)

        for _ in range(1000):
            # random.random() lies in [0.0, 1.0); flipping it gives
            # (0.0, 1.0], so math.log never sees 0.0 and cannot raise a
            # spurious "math domain error".
            num = 1.0 - random.random()
            ds["x"] = num
            runtime.run("log")
            rez_asm = ds["x"]
            rez_py = math.log(num)
            self.assertAlmostEqual(rez_asm, rez_py, 3)
示例#18
0
    def test_exp(self):
        """Compare ``fast_exp_ss`` with ``math.exp`` on 1000 random inputs.

        Inputs are drawn from [0, 4); results must agree to two decimal
        places.
        """
        machine_code = Tdasm().assemble(EXP_CODE)
        runtime = Runtime()
        load_math_func("fast_exp_ss", runtime)
        data = runtime.load("exp", machine_code)

        for _ in range(1000):
            value = random.random() * 4
            data["x"] = value
            runtime.run("exp")
            self.assertAlmostEqual(data["x"], math.exp(value), 2)
示例#19
0
    def test_log(self):
        """Check the scalar log routine against ``math.log``.

        Runs 1000 iterations; each result must match to three decimal
        places.
        """
        asm = Tdasm()
        mc = asm.assemble(LOG_CODE)
        runtime = Runtime()
        load_math_func("fast_log_ss", runtime)
        ds = runtime.load("log", mc)

        for _ in range(1000):
            # Map [0.0, 1.0) onto (0.0, 1.0]: math.log(0.0) raises
            # ValueError, which would fail the test for the wrong reason.
            num = 1.0 - random.random()
            ds["x"] = num
            runtime.run("log")
            rez_asm = ds["x"]
            rez_py = math.log(num)
            self.assertAlmostEqual(rez_asm, rez_py, 3)
示例#20
0
    def test_exp(self):
        """Check the scalar exp routine against ``math.exp``.

        Each of the 1000 iterations writes a random value in [0, 4) to "x",
        runs "exp", and compares the value read back to two decimal places.
        """
        assembler = Tdasm()
        code = assembler.assemble(EXP_CODE)
        rt = Runtime()
        load_math_func("fast_exp_ss", rt)
        section = rt.load("exp", code)

        for _ in range(1000):
            sample = random.random() * 4
            section["x"] = sample
            rt.run("exp")
            got = section["x"]
            want = math.exp(sample)
            self.assertAlmostEqual(got, want, 2)
示例#21
0
    def test_atan(self):
        """Check the scalar atan routine against ``math.atan``.

        Each of the 1000 iterations writes a random value in [0, 2000) to
        "x", runs "atan", and compares the value read back to three decimal
        places.
        """
        assembler = Tdasm()
        code = assembler.assemble(ATAN_CODE)
        rt = Runtime()
        load_math_func("fast_atan_ss", rt)
        section = rt.load("atan", code)

        for _ in range(1000):
            sample = random.random() * 2000
            section["x"] = sample
            rt.run("atan")
            got = section["x"]
            want = math.atan(sample)
            self.assertAlmostEqual(got, want, 3)
示例#22
0
class Structures:
    """Source-text provider for assembler ``struct`` definitions.

    Definitions come from the module-level ``structures`` mapping, except
    for "spectrum" and "hitpoint", whose layout depends on the renderer.
    """

    def __init__(self, renderer):
        """Keep the renderer and create the assembler used for compiling."""
        self.tdasm = Tdasm()
        self.renderer = renderer

        self._line1 = "struct spectrum \n"
        self._line3 = "end struct \n"

    def _spectrum_struct(self):
        """Return the ``spectrum`` struct source for the current renderer mode."""
        if self.renderer.spectral_rendering:
            count = str(self.renderer.nspectrum_samples)
            values_line = "float values[" + count + "] \n"
        else:
            values_line = "float values[4] \n"
        return self._line1 + values_line + self._line3

    def get_struct(self, name):
        """Return the source text of struct ``name``, or ``None`` if unknown."""
        if name in structures:
            return structures[name]
        if name == "spectrum":
            return self._spectrum_struct()
        if name == "hitpoint":
            # The hitpoint definition is emitted after the spectrum struct.
            return self._spectrum_struct() + HITPOINT
        return None

    def get_compiled_struct(self, name):
        """Assemble struct ``name`` and return its compiled form.

        Only names present in ``structures`` are compiled; anything else
        yields ``None``.
        """
        if name not in structures:
            return None
        asm_code = """ #DATA
            """
        asm_code += self.get_struct(name)
        asm_code += """
            #CODE
            #END
            """
        return self.tdasm.assemble(asm_code).get_struct(name)

    def structs(self, names):
        """Concatenate the source of every struct in ``names``.

        Raises:
            ValueError: If any name has no definition.
        """
        parts = []
        for name in names:
            struct = self.get_struct(name)
            if struct is None:
                raise ValueError("Structure " + str(name) + " doesn't exist!")
            parts.append(struct)
        return "".join(parts)
示例#23
0
文件: shader.py 项目: mario007/renmas
 def compile(self, shaders=None, color_mgr=None):
     """Parse, generate and assemble this shader's source.

     Args:
         shaders: Optional list of shaders forwarded to the code generator.
             ``None`` (the default) is replaced by a new empty list so that
             no list object is shared between calls.
         color_mgr: Forwarded unchanged to the code generator.

     Side effects:
         Sets ``self._asm_code``, ``self._ret_type``,
         ``self._ext_functions`` and ``self._mc``.
     """
     if shaders is None:
         shaders = []
     stms = parse(self._code)
     cgen = CodeGenerator()
     asm_code, ret_type, fns = cgen.generate_code(stms, args=self._args,
                                                  is_func=self._is_func,
                                                  name=self._name,
                                                  func_args=self._func_args,
                                                  shaders=shaders,
                                                  color_mgr=color_mgr)
     self._asm_code = asm_code
     self._ret_type = ret_type
     self._ext_functions = fns
     # Separate name so ``asm`` is not reused for both text and assembler.
     assembler = Tdasm()
     self._mc = assembler.assemble(self._asm_code, naked=self._is_func,
                                   ia32=not cgen.BIT64)
示例#24
0
    def set_pixel_asm(self, runtime, label):
        """Generate and load a pixel-store routine exported as ``label``.

        Per the generated source, the routine takes x in eax, y in ebx and
        the value in xmm0; it multiplies y by ``pitch`` and x by 16, adds
        both to ``ptr_buffer`` and stores xmm0 at that address.  Register
        width follows ``platform.architecture()`` and the store mnemonic
        follows ``util.AVX``.  The loaded data section is kept in
        ``self.ds`` and filled with the buffer pointer and pitch.
        """
        if platform.architecture()[0] == "64bit":
            dest_reg = "rcx"
            ptr_decl = "uint64 ptr_buffer"
            load_ptr = "mov rcx, qword [ptr_buffer]"
            add_offset = "add rcx, rax"
        else:
            dest_reg = "ecx"
            ptr_decl = "uint32 ptr_buffer"
            load_ptr = "mov ecx, dword [ptr_buffer]"
            add_offset = "add ecx, eax"

        store_op = "vmovaps" if util.AVX else "movaps"
        store_line = store_op + " oword [" + dest_reg + "], xmm0"

        asm_code = """
        #DATA
        """ + ptr_decl + """
        uint32 pitch
        #CODE
        ; eax = x , ebx = y, value = xmm0
        """ + "global " + label + ": \n" + """
        imul ebx, dword [pitch]
        imul eax , eax, 16
        """ + load_ptr + """
        add eax, ebx
        """ + add_offset + "\n" + store_line + """
        ret
        """

        machine_code = Tdasm().assemble(asm_code, True)
        module_name = "ImageFloatRGBA" + str(hash(self))
        self.ds = runtime.load(module_name, machine_code)
        self.ds["ptr_buffer"] = self.pixels.ptr()
        self.ds["pitch"] = self.pitch
示例#25
0
文件: image.py 项目: mario007/renmas
    def set_pixel_asm(self, runtime, label):
        """Build the pixel-store routine for this image and load it.

        The assembly source is composed from fragments chosen by pointer
        width (``platform.architecture()``) and by ``util.AVX``; it is
        exported under ``label`` and loaded into ``runtime`` under a name
        derived from ``hash(self)``.  ``self.ds`` then receives the pixel
        buffer pointer and the pitch.
        """
        wide = platform.architecture()[0] == "64bit"
        target = "rcx" if wide else "ecx"

        if util.AVX:
            store = "vmovaps oword [" + target + "], xmm0"
        else:
            store = "movaps oword [" + target + "], xmm0"

        if wide:
            declare = "uint64 ptr_buffer"
            fetch = "mov rcx, qword [ptr_buffer]"
            advance = "add rcx, rax"
        else:
            declare = "uint32 ptr_buffer"
            fetch = "mov ecx, dword [ptr_buffer]"
            advance = "add ecx, eax"

        pieces = [
            """
        #DATA
        """,
            declare,
            """
        uint32 pitch
        #CODE
        ; eax = x , ebx = y, value = xmm0
        """,
            "global " + label + ": \n",
            """
        imul ebx, dword [pitch]
        imul eax , eax, 16
        """,
            fetch,
            """
        add eax, ebx
        """,
            advance + "\n",
            store,
            """
        ret
        """,
        ]
        asm_code = "".join(pieces)

        assembler = Tdasm()
        mc = assembler.assemble(asm_code, True)
        name = "ImageFloatRGBA" + str(hash(self))
        self.ds = runtime.load(name, mc)
        self.ds["ptr_buffer"] = self.pixels.ptr()
        self.ds["pitch"] = self.pitch
示例#26
0
    def test_pow(self):
        """Compare ``fast_pow_ss`` with ``math.pow`` on 1000 random pairs.

        Base and exponent are both drawn from [0, 3); results must agree to
        one decimal place.
        """
        machine_code = Tdasm().assemble(POW_CODE)
        runtime = Runtime()
        load_math_func("fast_pow_ss", runtime)
        data = runtime.load("pow", machine_code)

        for _ in range(1000):
            base = random.random() * 3
            exponent = random.random() * 3
            data["x"] = base
            data["y"] = exponent
            runtime.run("pow")
            self.assertAlmostEqual(data["x"], math.pow(base, exponent), 1)
示例#27
0
    def test_pow(self):
        """Check the scalar pow routine against ``math.pow``.

        Each of the 1000 iterations writes a random base to "x" and a random
        exponent to "y" (both in [0, 3)), runs "pow", and compares the value
        read back from "x" to one decimal place.
        """
        assembler = Tdasm()
        code = assembler.assemble(POW_CODE)
        rt = Runtime()
        load_math_func("fast_pow_ss", rt)
        section = rt.load("pow", code)

        for _ in range(1000):
            x_val = random.random() * 3
            y_val = random.random() * 3
            section["x"] = x_val
            section["y"] = y_val
            rt.run("pow")
            got = section["x"]
            want = math.pow(x_val, y_val)
            self.assertAlmostEqual(got, want, 1)
示例#28
0
class Structures:
    """Looks up and compiles assembler ``struct`` definitions by name.

    Most definitions are read from the module-level ``structures`` mapping;
    "spectrum" and "hitpoint" are generated from the renderer's settings.
    """

    def __init__(self, renderer):
        """Store ``renderer`` and create a ``Tdasm`` instance for compiling."""
        self.tdasm = Tdasm()
        self.renderer = renderer

        self._line1 = "struct spectrum \n"
        self._line3 = "end struct \n"

    def _build_spectrum(self):
        """Return the ``spectrum`` struct text sized for the renderer mode."""
        renderer = self.renderer
        if renderer.spectral_rendering:
            body = "float values[" + str(renderer.nspectrum_samples) + "] \n"
        else:
            body = "float values[4] \n"
        return self._line1 + body + self._line3

    def get_struct(self, name):
        """Return the definition text for ``name``; ``None`` when unknown."""
        if name in structures:
            return structures[name]

        if name == "spectrum":
            return self._build_spectrum()

        if name == "hitpoint":
            spectrum_text = self._build_spectrum()
            return spectrum_text + HITPOINT

        return None

    def get_compiled_struct(self, name):
        """Return the assembled struct ``name``.

        Names that are not keys of ``structures`` give ``None``.
        """
        if name not in structures:
            return None

        asm_code = """ #DATA
            """
        asm_code += self.get_struct(name)
        asm_code += """
            #CODE
            #END
            """
        machine_code = self.tdasm.assemble(asm_code)
        return machine_code.get_struct(name)

    def structs(self, names):
        """Return the joined definition text of all ``names``, in order.

        Raises:
            ValueError: When a name has no definition.
        """
        chunks = []
        for name in names:
            definition = self.get_struct(name)
            if definition is None:
                raise ValueError("Structure " + str(name) + " doesn't exist!")
            chunks.append(definition)
        return "".join(chunks)
示例#29
0
    def test_sincos(self):
        """Compare ``fast_sincos_ss`` with ``math.sin`` and ``math.cos``.

        Runs 1000 rounds with inputs in [0, 2000).  The sine is read back
        from "x" and the cosine from "y"; both must agree to three decimal
        places.
        """
        machine_code = Tdasm().assemble(SINCOS_CODE)
        runtime = Runtime()
        load_math_func("fast_sincos_ss", runtime)
        data = runtime.load("sincos", machine_code)

        for _ in range(1000):
            angle = random.random() * 2000
            data["x"] = angle
            runtime.run("sincos")
            asm_sin = data["x"]
            asm_cos = data["y"]

            self.assertAlmostEqual(asm_sin, math.sin(angle), 3)
            self.assertAlmostEqual(asm_cos, math.cos(angle), 3)
示例#30
0
文件: shader.py 项目: mario007/renmas
 def compile(self, shaders=None, color_mgr=None):
     """Turn the shader source into machine code.

     Args:
         shaders: Optional list of shaders passed to
             ``CodeGenerator.generate_code``.  Defaults to a fresh empty
             list per call instead of one list shared by every call.
         color_mgr: Passed through to ``generate_code`` unchanged.

     Side effects:
         Stores the generated assembly (``self._asm_code``), return type
         (``self._ret_type``), external functions (``self._ext_functions``)
         and assembled code (``self._mc``).
     """
     if shaders is None:
         shaders = []
     stms = parse(self._code)
     cgen = CodeGenerator()
     asm_code, ret_type, fns = cgen.generate_code(stms,
                                                  args=self._args,
                                                  is_func=self._is_func,
                                                  name=self._name,
                                                  func_args=self._func_args,
                                                  shaders=shaders,
                                                  color_mgr=color_mgr)
     self._asm_code = asm_code
     self._ret_type = ret_type
     self._ext_functions = fns
     # Distinct name: ``asm`` previously meant both the source text and the
     # assembler instance.
     assembler = Tdasm()
     self._mc = assembler.assemble(self._asm_code,
                                   naked=self._is_func,
                                   ia32=not cgen.BIT64)
示例#31
0
    def test_sincos(self):
        """Check the scalar sine/cosine routine against the ``math`` module.

        Each of the 1000 iterations writes a random value in [0, 2000) to
        "x", runs "sincos", then compares "x" with ``math.sin`` and "y" with
        ``math.cos`` to three decimal places.
        """
        assembler = Tdasm()
        code = assembler.assemble(SINCOS_CODE)
        rt = Runtime()
        load_math_func("fast_sincos_ss", rt)
        section = rt.load("sincos", code)

        for _ in range(1000):
            sample = random.random() * 2000
            section["x"] = sample
            rt.run("sincos")
            got_sin = section["x"]
            got_cos = section["y"]

            want_sin = math.sin(sample)
            want_cos = math.cos(sample)
            self.assertAlmostEqual(got_sin, want_sin, 3)
            self.assertAlmostEqual(got_cos, want_cos, 3)
示例#32
0
文件: sam.py 项目: mario007/renmas
def random_sampler():
    """Exercise a 1x1, one-sample-per-pixel random sampler.

    Loads the sampler's ``get_sample`` routine and the assembled
    ``ASM_CODE`` into a fresh runtime, then requests ``width * height * spp``
    samples followed by one additional sample.
    """
    runtime = Runtime()
    width = height = spp = 1
    sampler = renmas2.samplers.RandomSampler(width, height, spp=spp, pixel=1.0)
    sampler.get_sample_asm([runtime], 'get_sample')

    tile = renmas2.core.Tile(0, 0, width, height)
    tile.split(1)
    sampler.set_tile(tile)

    runtime.load("test", Tdasm().assemble(ASM_CODE))

    for _ in range(width * height * spp):
        get_sample(sampler, runtime, "test")

    # One more request after the expected number of samples was consumed.
    get_sample(sampler, runtime, "test")
示例#33
0
    def test_log_ps(self):
        """Compare packed ``fast_log_ps`` with ``math.log`` lane by lane.

        Runs 1000 rounds of four random inputs each; every lane of ``v1``
        must match ``math.log`` to three decimal places.
        """
        asm = Tdasm()
        mc = asm.assemble(LOG_CODE_PS)
        runtime = Runtime()
        load_math_func("fast_log_ps", runtime)
        ds = runtime.load("log_ps", mc)

        for _ in range(1000):
            # random.random() lies in [0.0, 1.0); flipping it gives
            # (0.0, 1.0], so math.log never receives 0.0 and cannot raise a
            # spurious "math domain error".
            nums = tuple(1.0 - random.random() for _lane in range(4))
            ds["v1"] = nums
            runtime.run("log_ps")
            rez_asm = ds["v1"]
            rez_py = [math.log(num) for num in nums]

            for lane in range(4):
                self.assertAlmostEqual(rez_asm[lane], rez_py[lane], 3)
示例#34
0
    def test_log_ps(self):
        """Check the four-lane log routine against ``math.log``.

        Each of the 1000 iterations stores four random inputs in ``v1``,
        runs "log_ps", and compares every lane read back from ``v1`` to
        three decimal places.
        """
        asm = Tdasm()
        mc = asm.assemble(LOG_CODE_PS)
        runtime = Runtime()
        load_math_func("fast_log_ps", runtime)
        ds = runtime.load("log_ps", mc)

        lanes = range(4)
        for _ in range(1000):
            # Map [0.0, 1.0) onto (0.0, 1.0]: math.log(0.0) raises
            # ValueError, which would fail the test for the wrong reason.
            nums = tuple(1.0 - random.random() for _lane in lanes)
            ds["v1"] = nums
            runtime.run("log_ps")
            rez_asm = ds["v1"]
            rez_py = tuple(math.log(num) for num in nums)

            for idx in lanes:
                self.assertAlmostEqual(rez_asm[idx], rez_py[idx], 3)
示例#35
0
文件: cosss.py 项目: mario007/renmas
def cos_ss():
    """Assemble the ``fast_cos_ss`` routine and return its machine code.

    Two instruction encodings of the same routine are defined over a shared
    data section: a plain SSE version and a ``v``-prefixed AVX version.
    ``proc.AVX`` selects which one is assembled.  Per the assembly text, the
    argument and the result both live in ``xmm0``.
    """
    # Constants shared by both encodings: masks, pi/2 and 2/pi scale
    # factors, and the polynomial coefficients p0..p3.
    data = """
    #DATA
    uint32 _ps_am_inv_sign_mask[4] = 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF
    float _ps_am_pi_o_2[4] = 1.57079632679, 1.57079632679, 1.57079632679, 1.57079632679
    float _ps_am_2_o_pi[4] = 0.63661977236, 0.63661977236, 0.63661977236, 0.63661977236
    uint32 _epi32_1[4] = 1, 1, 1, 1
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0
    uint32 _epi32_2[4] = 2, 2, 2, 2

    float _ps_sincos_p3[4] = -0.00468175413, -0.00468175413, -0.00468175413, -0.00468175413
    float _ps_sincos_p2[4] = 0.0796926262, 0.0796926262, 0.0796926262, 0.0796926262
    float _ps_sincos_p1[4] = -0.64596409750621,-0.64596409750621,-0.64596409750621,-0.64596409750621 
    float _ps_sincos_p0[4] = 1.570796326794896, 1.570796326794896, 1.570796326794896, 1.570796326794896
    """

    # SSE encoding of the routine.
    asm_code = data + """

    #CODE
    global fast_cos_ss:
    movss	xmm1, dword [_ps_am_inv_sign_mask]
    movss	xmm2, dword [_ps_am_pi_o_2]
    movss	xmm3, dword [_ps_am_2_o_pi]
    andps	xmm0, xmm1
    addss	xmm0, xmm2
    mulss	xmm0, xmm3

    pxor	xmm3, xmm3
    movd	xmm5, dword [_epi32_1]
    movss	xmm4, dword [_ps_am_1]
    cvttps2dq	xmm2, xmm0
    pand	xmm5, xmm2
    movd	xmm1, dword [_epi32_2]
    pcmpeqd	xmm5, xmm3
    cvtdq2ps	xmm6, xmm2
    pand	xmm2, xmm1
    pslld	xmm2, 30 

    subss	xmm0, xmm6
    movss	xmm3, dword [_ps_sincos_p3]
    minss	xmm0, xmm4
    subss	xmm4, xmm0
    andps	xmm0, xmm5
    andnps	xmm5, xmm4
    orps	xmm0, xmm5

    movaps	xmm1, xmm0
    movss	xmm4, dword [_ps_sincos_p2]
    mulss	xmm0, xmm0
    movss	xmm5, dword [_ps_sincos_p1]
    orps	xmm1, xmm2
    movaps	xmm7, xmm0
    mulss	xmm0, xmm3
    movss	xmm6, dword [_ps_sincos_p0]
    addss	xmm0, xmm4
    mulss	xmm0, xmm7
    addss	xmm0, xmm5
    mulss	xmm0, xmm7
    addss	xmm0, xmm6
    mulss	xmm0, xmm1
    ret
    """

    # AVX encoding: the same instruction sequence in three-operand form.
    avx_code = data + """

    #CODE
    global fast_cos_ss:
    vmovss	xmm1, dword [_ps_am_inv_sign_mask]
    vmovss	xmm2, dword [_ps_am_pi_o_2]
    vmovss	xmm3, dword [_ps_am_2_o_pi]
    vandps	xmm0, xmm0, xmm1
    vaddss	xmm0, xmm0, xmm2
    vmulss	xmm0, xmm0, xmm3

    vpxor	xmm3, xmm3, xmm3
    vmovd	xmm5, dword [_epi32_1]
    vmovss	xmm4, dword [_ps_am_1]
    vcvttps2dq	xmm2, xmm0
    vpand	xmm5, xmm5, xmm2
    vmovd	xmm1, dword [_epi32_2]
    vpcmpeqd	xmm5, xmm5, xmm3
    vcvtdq2ps	xmm6, xmm2
    vpand	xmm2, xmm2, xmm1
    vpslld	xmm2, xmm2, 30 

    vsubss	xmm0, xmm0, xmm6
    vmovss	xmm3, dword [_ps_sincos_p3]
    vminss	xmm0, xmm0, xmm4
    vsubss	xmm4, xmm4, xmm0
    vandps	xmm0, xmm0, xmm5
    vandnps	xmm5, xmm5, xmm4
    vorps	xmm0, xmm0, xmm5

    vmovaps	xmm1, xmm0
    vmovss	xmm4, dword [_ps_sincos_p2]
    vmulss	xmm0, xmm0, xmm0
    vmovss	xmm5, dword [_ps_sincos_p1]
    vorps	xmm1, xmm1, xmm2
    vmovaps	xmm7, xmm0
    vmulss	xmm0, xmm0, xmm3
    vmovss	xmm6, dword [_ps_sincos_p0]
    vaddss	xmm0, xmm0, xmm4
    vmulss	xmm0, xmm0, xmm7
    vaddss	xmm0, xmm0, xmm5
    vmulss	xmm0, xmm0, xmm7
    vaddss	xmm0, xmm0, xmm6
    vmulss	xmm0, xmm0, xmm1
    ret
    """

    asm = Tdasm()
    # NOTE(review): the second positional argument is presumably the
    # assembler's "naked" flag -- confirm against Tdasm.assemble.
    if proc.AVX:
        mc = asm.assemble(avx_code, True)
    else:
        mc = asm.assemble(asm_code, True)
   
    return mc
示例#36
0
        add dword [y], 1
        jmp _bltrgba
        _endblt:
        ret 
    """ 
    return code

def _blt_floatrgba_code(bgra=True):
    """Return the float-RGBA blit source for this interpreter's word size.

    Dispatches to the 64-bit builder when ``platform.architecture()``
    reports '64bit' and to the 32-bit builder otherwise, forwarding
    ``bgra`` unchanged.
    """
    is_64bit = platform.architecture()[0] == '64bit'
    builder = _blt_floatrgba_code64 if is_64bit else _blt_floatrgba_code32
    return builder(bgra)

# Import-time setup: assemble the default (bgra=True) blit routine and load
# it into a module-wide runtime under the name "blt_prgba_to_bgra".
_asm = Tdasm()
_mc = _asm.assemble(_blt_floatrgba_code())
_runtime = Runtime()
_data_section = _runtime.load("blt_prgba_to_bgra", _mc)

# Second variant from the same generator with bgra=False, loaded into the
# same runtime under the name "blt_prgba_to_rgba".
_mc2 = _asm.assemble(_blt_floatrgba_code(bgra=False))
_data_section2 = _runtime.load("blt_prgba_to_rgba", _mc2)

# blt float rgba to byte bgra
def blt_prgba_to_bgra(src, dest):

    assert isinstance(src, ImagePRGBA)
    assert isinstance(dest, ImageBGRA)

    sa, spitch = src.address_info() 
    da, dpitch = dest.address_info()
    dx = dy = sx = sy = 0
示例#37
0
文件: acosss.py 项目: mario007/renmas
def acos_ss():
    """Assemble the scalar ``fast_acos_ss`` routine and return its machine code.

    The generated routine reads its argument from the low lane of xmm0 and
    leaves the result there.  It first forms
    ``t = rsqrt((1 + x) / (1 - x))`` with the approximate ``rcpss`` /
    ``rsqrtss`` instructions, then runs an inlined arctangent that has two
    exits: ``l_small`` when ``|t| <= 1`` and the fall-through "big" path
    otherwise.  The arc cosine is ``2 * atan(t)``, so *both* exits must
    double the arctangent before returning.  The big path (reached for
    negative ``x``) used to return the undoubled value; it now doubles too.

    Two equivalent sources are kept, legacy SSE and VEX-encoded AVX; the
    one matching ``proc.AVX`` is assembled.

    Returns:
        The object produced by ``Tdasm.assemble(source, True)``.
    """
    data = """
    #DATA
    uint32 _ps_am_sign_mask[4] = 0x80000000, 0x80000000, 0x80000000, 0x80000000
    float _ps_am_m1[4] = -1.0, -1.0, -1.0, -1.0
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0
    float _ps_atan_t0[4] = -0.091646118527, -0.091646118527, -0.091646118527, -0.091646118527
    float _ps_atan_s0[4] = 1.2797564625, 1.2797564625, 1.2797564625, 1.2797564625
    float _ps_atan_s1[4] = 2.1972168858, 2.1972168858, 2.1972168858, 2.1972168858
    float _ps_atan_t1[4] = -1.395694568, -1.395694568, -1.395694568, -1.395694568
    float _ps_atan_s2[4] = 6.8193064723, 6.8193064723, 6.8193064723 ,6.8193064723
    float _ps_atan_t2[4] = -94.3939261227, -94.3939261227, -94.3939261227, -94.3939261227
    float _ps_atan_s3[4] = 28.205206687, 28.205206687, 28.205206687, 28.205206687
    float _ps_atan_t3[4] = 12.888383034, 12.888383034, 12.888383034, 12.888383034
    float _ps_am_pi_o_2[4] = 1.57079632679, 1.57079632679, 1.57079632679, 1.57079632679

    """
    # Legacy SSE encoding.
    asm_code = data + """
    #CODE
    global fast_acos_ss:
    movss xmm1, dword [_ps_am_1]
    movss xmm2, xmm1
    subss xmm1, xmm0
    addss xmm2, xmm0
    rcpss xmm1, xmm1
    mulss xmm2, xmm1
    rsqrtss xmm0, xmm2

    ;atan
    movss	xmm1, dword [_ps_am_sign_mask]
	rcpss	xmm4, xmm0
	orps	xmm1, xmm0
	movss	xmm6, xmm4
	comiss	xmm1, dword [_ps_am_m1]
	movss	xmm3, dword [_ps_atan_t0]
	jnc		l_small  ; 'c' is 'lt' for comiss

    ;l_big:
	mulss	xmm6, xmm6

	movss	xmm5, dword [_ps_atan_s0]
	addss	xmm5, xmm6

	movss	xmm7, dword [_ps_atan_s1]
	rcpss	xmm5, xmm5
	mulss	xmm5, xmm3
	movss	xmm3, dword [_ps_atan_t1]
	addss	xmm7, xmm6
	addss	xmm5, xmm7

	movss	xmm7, dword [_ps_atan_s2]
	rcpss	xmm5, xmm5
	mulss	xmm5, xmm3
	movss	xmm3, dword [_ps_atan_t2]
	addss	xmm7, xmm6
	addss	xmm5, xmm7

	movss	xmm7, dword [_ps_atan_s3]
	rcpss	xmm5, xmm5
	mulss	xmm5, xmm3
	movss	xmm3, dword [_ps_atan_t3]
	addss	xmm7, xmm6
	movss	xmm2, dword [_ps_am_sign_mask]
	mulss	xmm4, xmm3
	addss	xmm5, xmm7

	movss	xmm7, dword [_ps_am_pi_o_2]
	rcpss	xmm5, xmm5
	mulss	xmm5, xmm4

	andps	xmm0, xmm2
	orps	xmm0, xmm7
	subss	xmm0, xmm5
	addss	xmm0, xmm0 ;acos = 2 * atan, this line is not part of atan
	ret

    l_small:
	movaps	xmm2, xmm0
	mulss	xmm2, xmm2

	movss	xmm1, dword [_ps_atan_s0]
	addss	xmm1, xmm2

	movss	xmm7, dword [_ps_atan_s1]
	rcpss	xmm1, xmm1
	mulss	xmm1, xmm3
	movss	xmm3, dword [_ps_atan_t1]
	addss	xmm7, xmm2
	addss	xmm1, xmm7
			
	movss	xmm7, dword [_ps_atan_s2]
	rcpss	xmm1, xmm1
	mulss	xmm1, xmm3
	movss	xmm3, dword [_ps_atan_t2]
	addss	xmm7, xmm2
	addss	xmm1, xmm7

	movss	xmm7, dword [_ps_atan_s3]
	rcpss	xmm1, xmm1
	mulss	xmm1, xmm3
	movss	xmm3, dword [_ps_atan_t3]
	addss	xmm7, xmm2
	mulss	xmm0, xmm3
	addss	xmm1, xmm7

	rcpss	xmm1, xmm1
	mulss	xmm0, xmm1

    addss xmm0, xmm0 ;this line is not part of atan 
    ret

    """

    # VEX-encoded AVX counterpart of the routine above.
    avx_code = data + """

    #CODE
    global fast_acos_ss:
    vmovss xmm1, dword [_ps_am_1]
    vmovss xmm2, xmm2, xmm1
    vsubss xmm1, xmm1, xmm0
    vaddss xmm2, xmm2, xmm0
    vrcpss xmm1, xmm1, xmm1
    vmulss xmm2, xmm2, xmm1
    vrsqrtss xmm0, xmm0, xmm2

    ;atan
    vmovss	xmm1, dword [_ps_am_sign_mask]
	vrcpss	xmm4, xmm4, xmm0
	vorps	xmm1, xmm1, xmm0
	vmovss	xmm6, xmm6, xmm4
	vcomiss	xmm1, dword [_ps_am_m1]
	vmovss	xmm3, dword [_ps_atan_t0]
	jnc		l_small  ; 'c' is 'lt' for comiss

    ;l_big:
	vmulss	xmm6, xmm6, xmm6

	vmovss	xmm5, dword [_ps_atan_s0]
	vaddss	xmm5, xmm5, xmm6

	vmovss	xmm7, dword [_ps_atan_s1]
	vrcpss	xmm5, xmm5, xmm5
	vmulss	xmm5, xmm5, xmm3
	vmovss	xmm3, dword [_ps_atan_t1]
	vaddss	xmm7, xmm7, xmm6
	vaddss	xmm5, xmm5, xmm7

	vmovss	xmm7, dword [_ps_atan_s2]
	vrcpss	xmm5, xmm5, xmm5
	vmulss	xmm5, xmm5, xmm3
	vmovss	xmm3, dword [_ps_atan_t2]
	vaddss	xmm7, xmm7, xmm6
	vaddss	xmm5, xmm5, xmm7

	vmovss	xmm7, dword [_ps_atan_s3]
	vrcpss	xmm5, xmm5, xmm5
	vmulss	xmm5, xmm5, xmm3
	vmovss	xmm3, dword [_ps_atan_t3]
	vaddss	xmm7, xmm7, xmm6
	vmovss	xmm2, dword [_ps_am_sign_mask]
	vmulss	xmm4, xmm4, xmm3
	vaddss	xmm5, xmm5, xmm7

	vmovss	xmm7, dword [_ps_am_pi_o_2]
	vrcpss	xmm5, xmm5, xmm5
	vmulss	xmm5, xmm5, xmm4

	vandps	xmm0, xmm0, xmm2
	vorps	xmm0, xmm0, xmm7
	vsubss	xmm0, xmm0, xmm5
	vaddss	xmm0, xmm0, xmm0 ;acos = 2 * atan, this line is not part of atan
	ret

    l_small:
	vmovaps	xmm2, xmm0
	vmulss	xmm2, xmm2, xmm2

	vmovss	xmm1, dword [_ps_atan_s0]
	vaddss	xmm1, xmm1, xmm2

	vmovss	xmm7, dword [_ps_atan_s1]
	vrcpss	xmm1, xmm1, xmm1
	vmulss	xmm1, xmm1, xmm3
	vmovss	xmm3, dword [_ps_atan_t1]
	vaddss	xmm7, xmm7, xmm2
	vaddss	xmm1, xmm1, xmm7
			
	vmovss	xmm7, dword [_ps_atan_s2]
	vrcpss	xmm1, xmm1, xmm1
	vmulss	xmm1, xmm1, xmm3
	vmovss	xmm3, dword [_ps_atan_t2]
	vaddss	xmm7, xmm7, xmm2
	vaddss	xmm1, xmm1, xmm7

	vmovss	xmm7, dword [_ps_atan_s3]
	vrcpss	xmm1, xmm1, xmm1
	vmulss	xmm1, xmm1, xmm3
	vmovss	xmm3, dword [_ps_atan_t3]
	vaddss	xmm7, xmm7, xmm2
	vmulss	xmm0, xmm0, xmm3
	vaddss	xmm1, xmm1, xmm7

	vrcpss	xmm1, xmm1, xmm1
	vmulss	xmm0, xmm0, xmm1

    vaddss xmm0, xmm0, xmm0 ;this line is not part of atan 
    ret

    """
    asm = Tdasm()
    # Assemble only the encoding the host CPU is flagged as supporting.
    if proc.AVX:
        mc = asm.assemble(avx_code, True)
    else:
        mc = asm.assemble(asm_code, True)

    return mc
示例#38
0
def sincos_ss():
    """Assemble the scalar ``fast_sincos_ss`` routine and return its machine code.

    The generated routine takes its argument in xmm0 and evaluates the same
    polynomial on two reduced arguments in parallel, leaving one result in
    xmm0 and the other in xmm6 (presumably sine and cosine respectively --
    confirm against callers).  The memory stores of the original listing are
    commented out, so results are returned in registers only.

    An SSE and an AVX source share one data section; ``proc.AVX`` decides
    which is handed to ``Tdasm.assemble``.
    """
    constants = """
    #DATA
    uint32 _ps_am_inv_sign_mask[4] = 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF
    uint32 _ps_am_sign_mask[4] = 0x80000000, 0x80000000, 0x80000000, 0x80000000
    float _ps_am_pi_o_2[4] = 1.57079632679, 1.57079632679, 1.57079632679, 1.57079632679
    float _ps_am_2_o_pi[4] = 0.63661977236, 0.63661977236, 0.63661977236, 0.63661977236
    uint32 _epi32_1[4] = 1, 1, 1, 1
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0
    uint32 _epi32_2[4] = 2, 2, 2, 2

    float _ps_sincos_p3[4] = -0.00468175413, -0.00468175413, -0.00468175413, -0.00468175413
    float _ps_sincos_p2[4] = 0.0796926262, 0.0796926262, 0.0796926262, 0.0796926262
    float _ps_sincos_p1[4] = -0.64596409750621,-0.64596409750621,-0.64596409750621,-0.64596409750621 
    float _ps_sincos_p0[4] = 1.570796326794896, 1.570796326794896, 1.570796326794896, 1.570796326794896
    """
    # Legacy SSE encoding.
    sse_source = constants + """
    #CODE
    global fast_sincos_ss:
    movaps	xmm7, xmm0
    movss	xmm1, dword [_ps_am_inv_sign_mask]
    movss	xmm2, dword [_ps_am_sign_mask]
    movss	xmm3, dword [_ps_am_2_o_pi]
    andps	xmm0, xmm1
    andps	xmm7, xmm2
    mulss	xmm0, xmm3

    pxor	xmm3, xmm3
    movd	xmm5, dword [_epi32_1]
    movss	xmm4, dword [_ps_am_1]

    cvttps2dq	xmm2, xmm0
    pand	xmm5, xmm2
    movd	xmm1, dword [_epi32_2]
    pcmpeqd	xmm5, xmm3
    movd	xmm3, dword [_epi32_1]
    cvtdq2ps	xmm6, xmm2
    paddd	xmm3, xmm2
    pand	xmm2, xmm1
    pand	xmm3, xmm1
    subss	xmm0, xmm6
    pslld	xmm2, 30
    minss	xmm0, xmm4
    ;mov		eax, [esp + 4 + 16]
    ;mov		edx, [esp + 4 + 16 + 4]
    subss	xmm4, xmm0
    pslld	xmm3, 30

    movaps	xmm6, xmm4
    xorps	xmm2, xmm7
    movaps	xmm7, xmm5
    andps	xmm6, xmm7
    andnps	xmm7, xmm0
    andps	xmm0, xmm5
    andnps	xmm5, xmm4
    movss	xmm4, dword [_ps_sincos_p3]
    orps	xmm6, xmm7
    orps	xmm0, xmm5
    movss	xmm5, dword [_ps_sincos_p2]

    movaps	xmm1, xmm0
    movaps	xmm7, xmm6
    mulss	xmm0, xmm0
    mulss	xmm6, xmm6
    orps	xmm1, xmm2
    orps	xmm7, xmm3
    movaps	xmm2, xmm0
    movaps	xmm3, xmm6
    mulss	xmm0, xmm4
    mulss	xmm6, xmm4
    movss	xmm4, dword [_ps_sincos_p1]
    addss	xmm0, xmm5
    addss	xmm6, xmm5
    movss	xmm5, dword [_ps_sincos_p0]
    mulss	xmm0, xmm2
    mulss	xmm6, xmm3
    addss	xmm0, xmm4
    addss	xmm6, xmm4
    mulss	xmm0, xmm2
    mulss	xmm6, xmm3
    addss	xmm0, xmm5
    addss	xmm6, xmm5
    mulss	xmm0, xmm1
    mulss	xmm6, xmm7

    ;use full stores since caller might reload with full loads
    ;movaps	[eax], xmm0
    ;movaps	[edx], xmm6

    ret	
    """

    # VEX-encoded AVX counterpart.
    avx_source = constants + """
    #CODE
    global fast_sincos_ss:
    vmovaps	xmm7, xmm0
    vmovss	xmm1, dword [_ps_am_inv_sign_mask]
    vmovss	xmm2, dword [_ps_am_sign_mask]
    vmovss	xmm3, dword [_ps_am_2_o_pi]
    vandps	xmm0, xmm0, xmm1
    vandps	xmm7, xmm7, xmm2
    vmulss	xmm0, xmm0, xmm3

    vpxor	xmm3, xmm3, xmm3
    vmovd	xmm5, dword [_epi32_1]
    vmovss	xmm4, dword [_ps_am_1]

    vcvttps2dq	xmm2, xmm0
    vpand	xmm5, xmm5, xmm2
    vmovd	xmm1, dword [_epi32_2]
    vpcmpeqd	xmm5, xmm5, xmm3
    vmovd	xmm3, dword [_epi32_1]
    vcvtdq2ps	xmm6, xmm2
    vpaddd	xmm3, xmm3, xmm2
    vpand	xmm2, xmm2, xmm1
    vpand	xmm3, xmm3, xmm1
    vsubss	xmm0, xmm0, xmm6
    vpslld	xmm2, xmm2, 30
    vminss	xmm0, xmm0, xmm4
    ;mov		eax, [esp + 4 + 16]
    ;mov		edx, [esp + 4 + 16 + 4]
    vsubss	xmm4, xmm4, xmm0
    vpslld	xmm3, xmm3, 30

    vmovaps	xmm6, xmm4
    vxorps	xmm2, xmm2, xmm7
    vmovaps	xmm7, xmm5
    vandps	xmm6, xmm6, xmm7
    vandnps	xmm7, xmm7, xmm0
    vandps	xmm0, xmm0, xmm5
    vandnps	xmm5, xmm5, xmm4
    vmovss	xmm4, dword [_ps_sincos_p3]
    vorps	xmm6, xmm6, xmm7
    vorps	xmm0, xmm0, xmm5
    vmovss	xmm5, dword [_ps_sincos_p2]

    vmovaps	xmm1, xmm0
    vmovaps	xmm7, xmm6
    vmulss	xmm0, xmm0, xmm0
    vmulss	xmm6, xmm6, xmm6
    vorps	xmm1, xmm1, xmm2
    vorps	xmm7, xmm7, xmm3
    vmovaps	xmm2, xmm0
    vmovaps	xmm3, xmm6
    vmulss	xmm0, xmm0, xmm4
    vmulss	xmm6, xmm6, xmm4
    vmovss	xmm4, dword [_ps_sincos_p1]
    vaddss	xmm0, xmm0, xmm5
    vaddss	xmm6, xmm6, xmm5
    vmovss	xmm5, dword [_ps_sincos_p0]
    vmulss	xmm0, xmm0, xmm2
    vmulss	xmm6, xmm6, xmm3
    vaddss	xmm0, xmm0, xmm4
    vaddss	xmm6, xmm6, xmm4
    vmulss	xmm0, xmm0, xmm2
    vmulss	xmm6, xmm6, xmm3
    vaddss	xmm0, xmm0, xmm5
    vaddss	xmm6, xmm6, xmm5
    vmulss	xmm0, xmm0, xmm1
    vmulss	xmm6, xmm6, xmm7

    ;use full stores since caller might reload with full loads
    ;movaps	[eax], xmm0
    ;movaps	[edx], xmm6

    ret	
    """

    assembler = Tdasm()
    chosen_source = avx_source if proc.AVX else sse_source
    return assembler.assemble(chosen_source, True)
示例#39
0
文件: cosps.py 项目: mario007/renmas
def cos_ps():
    """Assemble the packed ``fast_cos_ps`` routine and return its machine code.

    The generated routine works in place on xmm0 using packed-single
    instructions with 16-byte (``oword``) constant operands.  It clears the
    sign bit, adds ``_ps_am_pi_o_2``, scales by ``_ps_am_2_o_pi`` and then
    evaluates an odd polynomial in the reduced argument.

    An SSE and an AVX source share one data section; ``proc.AVX`` decides
    which is handed to ``Tdasm.assemble``.
    """
    constants = """
    #DATA
    uint32 _ps_am_inv_sign_mask[4] = 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF
    float _ps_am_pi_o_2[4] = 1.57079632679, 1.57079632679, 1.57079632679, 1.57079632679
    float _ps_am_2_o_pi[4] = 0.63661977236, 0.63661977236, 0.63661977236, 0.63661977236
    uint32 _epi32_1[4] = 1, 1, 1, 1
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0
    uint32 _epi32_2[4] = 2, 2, 2, 2

    float _ps_sincos_p3[4] = -0.00468175413, -0.00468175413, -0.00468175413, -0.00468175413
    float _ps_sincos_p2[4] = 0.0796926262, 0.0796926262, 0.0796926262, 0.0796926262
    float _ps_sincos_p1[4] = -0.64596409750621,-0.64596409750621,-0.64596409750621,-0.64596409750621 
    float _ps_sincos_p0[4] = 1.570796326794896, 1.570796326794896, 1.570796326794896, 1.570796326794896
    """

    # Legacy SSE encoding.
    sse_source = constants + """

    #CODE
    global fast_cos_ps:
    andps	xmm0, oword [_ps_am_inv_sign_mask]
    addps	xmm0, oword [_ps_am_pi_o_2]
    mulps	xmm0, oword [_ps_am_2_o_pi]

    pxor	xmm3, xmm3
    movdqa	xmm5, oword [_epi32_1]
    movaps	xmm4, oword [_ps_am_1]
    cvttps2dq	xmm2, xmm0
    pand	xmm5, xmm2
    pcmpeqd	xmm5, xmm3
    cvtdq2ps	xmm6, xmm2
    pand	xmm2, oword [_epi32_2]
    pslld	xmm2, 30 

    subps	xmm0, xmm6
    minps	xmm0, xmm4
    subps	xmm4, xmm0
    andps	xmm0, xmm5
    andnps	xmm5, xmm4
    orps	xmm0, xmm5

    movaps	xmm1, xmm0
    mulps	xmm0, xmm0
    orps	xmm1, xmm2
    movaps	xmm7, xmm0
    mulps	xmm0, oword [_ps_sincos_p3]
    addps	xmm0, oword [_ps_sincos_p2]
    mulps	xmm0, xmm7
    addps	xmm0, oword [_ps_sincos_p1]
    mulps	xmm0, xmm7
    addps	xmm0, oword [_ps_sincos_p0]
    mulps	xmm0, xmm1
    ret
    """

    # VEX-encoded AVX counterpart.
    avx_source = constants + """

    #CODE
    global fast_cos_ps:
    vandps	xmm0, xmm0, oword [_ps_am_inv_sign_mask]
    vaddps	xmm0, xmm0, oword [_ps_am_pi_o_2]
    vmulps	xmm0, xmm0, oword [_ps_am_2_o_pi]

    vpxor	xmm3, xmm3, xmm3
    vmovdqa	xmm5, oword [_epi32_1]
    vmovaps	xmm4, oword [_ps_am_1]
    vcvttps2dq	xmm2, xmm0
    vpand	xmm5, xmm5, xmm2
    vpcmpeqd	xmm5, xmm5, xmm3
    vcvtdq2ps	xmm6, xmm2
    vpand	xmm2, xmm2, oword [_epi32_2]
    vpslld	xmm2, xmm2, 30 

    vsubps	xmm0, xmm0, xmm6
    vminps	xmm0, xmm0, xmm4
    vsubps	xmm4, xmm4, xmm0
    vandps	xmm0, xmm0, xmm5
    vandnps	xmm5, xmm5, xmm4
    vorps	xmm0, xmm0, xmm5

    vmovaps	xmm1, xmm0
    vmulps	xmm0, xmm0, xmm0
    vorps	xmm1, xmm1, xmm2
    vmovaps	xmm7, xmm0
    vmulps	xmm0, xmm0, oword [_ps_sincos_p3]
    vaddps	xmm0, xmm0, oword [_ps_sincos_p2]
    vmulps	xmm0, xmm0, xmm7
    vaddps	xmm0, xmm0, oword [_ps_sincos_p1]
    vmulps	xmm0, xmm0, xmm7
    vaddps	xmm0, xmm0, oword [_ps_sincos_p0]
    vmulps	xmm0, xmm0, xmm1
    ret
    """

    assembler = Tdasm()
    chosen_source = avx_source if proc.AVX else sse_source
    return assembler.assemble(chosen_source, True)
示例#40
0
文件: rng.py 项目: mario007/renmas
    def random_float(cls, runtime, label):
        """Assemble a random-float routine, load it into ``runtime`` and seed it.

        The routine is exported under the global label ``label``.  Each call
        of the generated code multiplies the four 32-bit words of
        ``cur_seed`` by ``mult``, adds ``gadd``, stores the new state back to
        ``cur_seed`` and converts the masked state to four floats in xmm0.

        The SSE or AVX source is chosen by ``util.AVX``.  The machine code is
        loaded under a unique ``"randomfloat<N>"`` name and ``cur_seed`` in
        the returned data section is initialised with four random integers.
        """
        # Both encodings start with the class data section, the #CODE marker
        # and the caller-supplied global label.
        prologue = cls.data + """
        #CODE
        """ + " global " + label + ": "

        sse_source = prologue + """
            movdqa xmm0, oword [gadd]
            movdqa xmm1, oword [mult]
            movdqa xmm2, oword [mask]


            pshufd xmm4, oword [cur_seed], 10110001b
            movdqa xmm5, oword [cur_seed]
            pmuludq xmm5, xmm1
            pshufd xmm1, xmm1, 10110001b
            pmuludq xmm4, xmm1
            pand xmm5, xmm2
            pand xmm4, xmm2
            pshufd xmm4, xmm4, 10110001b
            por xmm5, xmm4
            paddd xmm5, xmm0

            movdqa oword [cur_seed], xmm5

            ;convert to float
            pand xmm5, oword [_random_sign_mask]
            cvtdq2ps xmm0, xmm5
            mulps xmm0, oword [_random_flt]

            ret

        """

        avx_source = prologue + """
            vmovdqa xmm0, oword [gadd]
            vmovdqa xmm1, oword [mult]
            vmovdqa xmm2, oword [mask]


            vpshufd xmm4, oword [cur_seed], 10110001b
            vmovdqa xmm5, oword [cur_seed]
            vpmuludq xmm5, xmm5, xmm1
            vpshufd xmm1, xmm1, 10110001b
            vpmuludq xmm4, xmm4, xmm1
            vpand xmm5, xmm5, xmm2
            vpand xmm4, xmm4, xmm2
            vpshufd xmm4, xmm4, 10110001b
            vpor xmm5, xmm5, xmm4
            vpaddd xmm5, xmm5, xmm0

            vmovdqa oword [cur_seed], xmm5

            ;convert to float
            vpand xmm5, xmm5, oword [_random_sign_mask]
            vcvtdq2ps xmm0, xmm5
            vmulps xmm0, xmm0, oword [_random_flt]
            
            ret

        """

        assembler = Tdasm()
        chosen_source = avx_source if util.AVX else sse_source
        machine_code = assembler.assemble(chosen_source, True)

        # A unique module name lets several generators coexist in one runtime.
        section = runtime.load("randomfloat" + str(util.unique()), machine_code)
        # One independent seed per 32-bit lane of the generator state.
        section['cur_seed'] = tuple(
            random.randint(0, 4000000000) for _ in range(4))
示例#41
0
文件: logps.py 项目: mario007/renmas
def log_ps():
    """Assemble the packed ``fast_log_ps`` routine and return its machine code.

    The generated routine works in place on xmm0 with packed-single
    instructions.  It clamps the input to the smallest normal positive
    float, splits it into exponent (shifted out with ``psrld 23`` and
    unbiased by 0x7F) and mantissa, evaluates a rational approximation on
    the mantissa and adds the exponent scaled by ``_ps_log_c0``
    (0.693..., i.e. presumably ln 2, making this a natural logarithm).

    An SSE and an AVX source share one data section; ``proc.AVX`` decides
    which is handed to ``Tdasm.assemble``.
    """
    constants = """
    #DATA
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0
    uint32 _ps_am_min_norm_pos[4] = 0x00800000, 0x00800000, 0x00800000, 0x00800000
    uint32 _ps_am_inv_mant_mask[4] = 0x807FFFFF, 0x807FFFFF, 0x807FFFFF, 0x807FFFFF 
    uint32 _epi32_0x7f[4] = 0x7F, 0x7F, 0x7F, 0x7F
    float _ps_log_p0[4] = -0.789580278884, -0.789580278884, -0.789580278884, -0.789580278884
    float _ps_log_q0[4] = -35.6722798256, -35.6722798256, -35.6722798256, -35.6722798256
    float _ps_log_p1[4] = 16.38666456995, 16.38666456995, 16.38666456995, 16.38666456995
    float _ps_log_q1[4] = 312.0937663722, 312.0937663722, 312.0937663722, 312.0937663722
    float _ps_log_p2[4] = -64.14099529587, -64.14099529587, -64.14099529587, -64.14099529587
    float _ps_log_q2[4] = -769.69194355046, -769.69194355046, -769.69194355046, -769.69194355046
    float _ps_log_c0[4] = 0.6931471805599, 0.6931471805599, 0.6931471805599, 0.6931471805599 

    """
    # Legacy SSE encoding.
    sse_source = constants + """

    #CODE
    global fast_log_ps:
    maxps	xmm0, oword [_ps_am_min_norm_pos]  ; cut off denormalized stuff
    movaps	xmm1, oword [_ps_am_1]
    movaps	xmm3, xmm0

    andps	xmm0, oword [_ps_am_inv_mant_mask]
    orps	xmm0, xmm1

    movaps	xmm4, xmm0
    subps	xmm0, xmm1
    addps	xmm4, xmm1
    psrld	xmm3, 23
    rcpps	xmm4, xmm4
    mulps	xmm0, xmm4
    psubd	xmm3, oword [_epi32_0x7f]
    addps	xmm0, xmm0

    movaps	xmm2, xmm0
    mulps	xmm0, xmm0

    movaps	xmm4, oword [_ps_log_p0]
    movaps	xmm6, oword [_ps_log_q0]

    mulps	xmm4, xmm0
    movaps	xmm5, oword [_ps_log_p1]
    mulps	xmm6, xmm0
    movaps	xmm7, oword [_ps_log_q1]

    addps	xmm4, xmm5
    addps	xmm6, xmm7

    movaps	xmm5, oword [_ps_log_p2]
    mulps	xmm4, xmm0
    movaps	xmm7, oword [_ps_log_q2]
    mulps	xmm6, xmm0

    addps	xmm4, xmm5
    movaps	xmm5, oword [_ps_log_c0]
    addps	xmm6, xmm7
    cvtdq2ps	xmm1, xmm3

    mulps	xmm0, xmm4
    rcpps	xmm6, xmm6

    mulps	xmm0, xmm6
    mulps	xmm0, xmm2

    mulps	xmm1, xmm5

    addps	xmm0, xmm2
    addps	xmm0, xmm1

    ret	


    """

    # VEX-encoded AVX counterpart.
    avx_source = constants + """

    #CODE
    global fast_log_ps:
    vmaxps	xmm0, xmm0, oword [_ps_am_min_norm_pos]  ; cut off denormalized stuff
    vmovaps	xmm1, oword [_ps_am_1]
    vmovaps	xmm3, xmm0

    vandps	xmm0, xmm0, oword [_ps_am_inv_mant_mask]
    vorps	xmm0, xmm0, xmm1

    vmovaps	xmm4, xmm0
    vsubps	xmm0, xmm0, xmm1
    vaddps	xmm4, xmm4, xmm1
    vpsrld	xmm3, xmm3, 23
    vrcpps	xmm4, xmm4
    vmulps	xmm0, xmm0, xmm4
    vpsubd	xmm3, xmm3, oword [_epi32_0x7f]
    vaddps	xmm0, xmm0, xmm0

    vmovaps	xmm2, xmm0
    vmulps	xmm0, xmm0, xmm0

    vmovaps	xmm4, oword [_ps_log_p0]
    vmovaps	xmm6, oword [_ps_log_q0]

    vmulps	xmm4, xmm4, xmm0
    vmovaps	xmm5, oword [_ps_log_p1]
    vmulps	xmm6, xmm6, xmm0
    vmovaps	xmm7, oword [_ps_log_q1]

    vaddps	xmm4, xmm4, xmm5
    vaddps	xmm6, xmm6, xmm7

    vmovaps	xmm5, oword [_ps_log_p2]
    vmulps	xmm4, xmm4, xmm0
    vmovaps	xmm7, oword [_ps_log_q2]
    vmulps	xmm6, xmm6, xmm0

    vaddps	xmm4, xmm4, xmm5
    vmovaps	xmm5, oword [_ps_log_c0]
    vaddps	xmm6, xmm6, xmm7
    vcvtdq2ps	xmm1, xmm3

    vmulps	xmm0, xmm0, xmm4
    vrcpps	xmm6, xmm6

    vmulps	xmm0, xmm0, xmm6
    vmulps	xmm0, xmm0, xmm2

    vmulps	xmm1, xmm1, xmm5

    vaddps	xmm0, xmm0, xmm2
    vaddps	xmm0, xmm0, xmm1

    ret	


    """

    assembler = Tdasm()
    chosen_source = avx_source if proc.AVX else sse_source
    return assembler.assemble(chosen_source, True)
示例#42
0
文件: tanps.py 项目: mario007/renmas
def tan_ps():
    """Assemble the packed ``fast_tan_ps`` routine and return its machine code.

    The generated routine works in place on xmm0 with packed-single
    instructions.  It strips and remembers the sign, reduces the argument
    using the ``4/pi`` and ``pi/4`` constants, evaluates a rational
    approximation, and branches to ``l_pole`` when ``movmskps`` reports
    that not all four lanes passed the mask test; there the affected lanes
    are replaced by ``_ps_tan_poleval`` carrying the appropriate sign.
    The routine overwrites ``eax`` with the lane mask.

    An SSE and an AVX source share one data section; ``proc.AVX`` decides
    which is handed to ``Tdasm.assemble``.
    """
    constants = """
    #DATA
    uint32 _ps_am_inv_sign_mask[4] = 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF
    float _ps_am_4_o_pi[4] = 1.273239544735, 1.273239544735, 1.273239544735, 1.273239544735
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0
    float _ps_am_pi_o_4[4] = 0.78539816339, 0.78539816339, 0.78539816339, 0.78539816339
    int32 _epi32_1[4] = 1, 1, 1, 1
    int32 _epi32_7[4] = 7, 7, 7, 7
    int32 _epi32_2[4] = 2, 2, 2, 2
    uint32 _ps_am_sign_mask[4] = 0x80000000, 0x80000000, 0x80000000, 0x80000000
    float _ps_tan_p0[4] = -17956525.197648, -17956525.197648, -17956525.197648, -17956525.197648 
    float _ps_tan_q0[4] = -53869575.592945, -53869575.592945, -53869575.592945, -53869575.592945 
    float _ps_tan_p1[4] = 1153516.64838587, 1153516.64838587, 1153516.64838587, 1153516.64838587
    float _ps_tan_q1[4] = 25008380.18233579, 25008380.18233579, 25008380.18233579, 25008380.18233579
    float _ps_tan_p2[4] = -13093.693918138, -13093.693918138, -13093.693918138, -13093.693918138
    float _ps_tan_q2[4] = -1320892.3444021, -1320892.3444021, -1320892.3444021, -1320892.3444021
    float _ps_tan_q3[4] = 13681.296347069, 13681.296347069, 13681.296347069, 13681.296347069
    float _ps_tan_poleval[4] = 36893500000000000000.0, 36893500000000000000.0, 36893500000000000000.0, 36893500000000000000.0

    """
    # Legacy SSE encoding.
    sse_source = constants + """

    #CODE
    global fast_tan_ps:
    movaps	xmm7, xmm0
    andps	xmm0, oword [_ps_am_inv_sign_mask]
    andps	xmm7, oword [_ps_am_sign_mask]
    movaps	xmm1, xmm0
    mulps	xmm0, oword [_ps_am_4_o_pi]

    cvttps2dq	xmm0, xmm0
    movdqa	xmm4, oword [_epi32_1]
    movdqa	xmm5, oword [_epi32_7]

    pand	xmm4, xmm0
    pand	xmm5, xmm0
    movaps	xmm3, oword [_ps_am_1]
    paddd	xmm0, xmm4
    paddd	xmm5, xmm4

    cvtdq2ps	xmm0, xmm0

    mulps	xmm0, oword [_ps_am_pi_o_4]
    xorps	xmm6, xmm6
    subps	xmm1, xmm0
    movaps	xmm2, oword [_ps_tan_p2]
    minps	xmm1, xmm3
    movaps	xmm3, oword [_ps_tan_q3]
    movaps	xmm0, xmm1
    mulps	xmm1, xmm1

    mulps	xmm2, xmm1
    addps	xmm3, xmm1
    addps	xmm2, oword [_ps_tan_p1]
    mulps	xmm3, xmm1
    mulps	xmm2, xmm1
    addps	xmm3, oword [_ps_tan_q2]
    addps	xmm2, oword [_ps_tan_p0]
    mulps	xmm3, xmm1
    mulps	xmm2, xmm1
    addps	xmm3, oword [_ps_tan_q1]
    xorps	xmm0, xmm7
    mulps	xmm3, xmm1
    pand	xmm5, oword [_epi32_2]
    addps	xmm3, oword [_ps_tan_q0]
    mulps	xmm2, xmm0

    cmpps xmm6, xmm1, 4
    rcpps	xmm4, xmm3
    pxor	xmm7, xmm7
    mulps	xmm3, xmm4
    pcmpeqd	xmm5, xmm7
    mulps	xmm3, xmm4
    addps	xmm4, xmm4
    orps	xmm6, xmm5
    subps	xmm4, xmm3

    mulps	xmm2, xmm4
    movaps	xmm1, oword [_ps_am_sign_mask]
    movmskps	eax, xmm6
    addps	xmm2, xmm0

    rcpps	xmm4, xmm2
    cmp		eax, 0xf
    movaps	xmm0, xmm2
    mulps	xmm2, xmm4
    mulps	xmm2, xmm4
    addps	xmm4, xmm4
    subps	xmm4, xmm2
    jne		l_pole

    xorps	xmm4, xmm1

    andps	xmm0, xmm5
    andnps	xmm5, xmm4
    orps	xmm0, xmm5

    ret	

    l_pole:
    movaps	xmm7, xmm1
    movaps	xmm3, oword [_ps_tan_poleval]
    andps	xmm1, xmm0
    orps	xmm3, xmm1
    andps	xmm4, xmm6
    andnps	xmm6, xmm3
    orps	xmm4, xmm6

    xorps	xmm4, xmm7

    andps	xmm0, xmm5
    andnps	xmm5, xmm4
    orps	xmm0, xmm5

    ret	


    """

    # VEX-encoded AVX counterpart.
    avx_source = constants + """

    #CODE
    global fast_tan_ps:
    vmovaps	xmm7, xmm0
    vandps	xmm0, xmm0, oword [_ps_am_inv_sign_mask]
    vandps	xmm7, xmm7, oword [_ps_am_sign_mask]
    vmovaps	xmm1, xmm0
    vmulps	xmm0, xmm0, oword [_ps_am_4_o_pi]

    vcvttps2dq	xmm0, xmm0
    vmovdqa	xmm4, oword [_epi32_1]
    vmovdqa	xmm5, oword [_epi32_7]

    vpand	xmm4, xmm4, xmm0
    vpand	xmm5, xmm5, xmm0
    vmovaps	xmm3, oword [_ps_am_1]
    vpaddd	xmm0, xmm0, xmm4
    vpaddd	xmm5, xmm5, xmm4

    vcvtdq2ps	xmm0, xmm0

    vmulps	xmm0, xmm0, oword [_ps_am_pi_o_4]
    vxorps	xmm6, xmm6, xmm6
    vsubps	xmm1, xmm1, xmm0
    vmovaps	xmm2, oword [_ps_tan_p2]
    vminps	xmm1, xmm1, xmm3
    vmovaps	xmm3, oword [_ps_tan_q3]
    vmovaps	xmm0, xmm1
    vmulps	xmm1, xmm1, xmm1

    vmulps	xmm2, xmm2, xmm1
    vaddps	xmm3, xmm3, xmm1
    vaddps	xmm2, xmm2, oword [_ps_tan_p1]
    vmulps	xmm3, xmm3, xmm1
    vmulps	xmm2, xmm2, xmm1
    vaddps	xmm3, xmm3, oword [_ps_tan_q2]
    vaddps	xmm2, xmm2, oword [_ps_tan_p0]
    vmulps	xmm3, xmm3, xmm1
    vmulps	xmm2, xmm2, xmm1
    vaddps	xmm3, xmm3, oword [_ps_tan_q1]
    vxorps	xmm0, xmm0, xmm7
    vmulps	xmm3, xmm3, xmm1
    vpand	xmm5, xmm5, oword [_epi32_2]
    vaddps	xmm3, xmm3, oword [_ps_tan_q0]
    vmulps	xmm2, xmm2, xmm0

    vcmpps xmm6, xmm6, xmm1, 4
    vrcpps	xmm4, xmm3
    vpxor	xmm7, xmm7, xmm7
    vmulps	xmm3, xmm3, xmm4
    vpcmpeqd	xmm5, xmm5, xmm7
    vmulps	xmm3, xmm3, xmm4
    vaddps	xmm4, xmm4, xmm4
    vorps	xmm6, xmm6, xmm5
    vsubps	xmm4, xmm4, xmm3

    vmulps	xmm2, xmm2, xmm4
    vmovaps	xmm1, oword [_ps_am_sign_mask]
    vmovmskps	eax, xmm6
    vaddps	xmm2, xmm2, xmm0

    vrcpps	xmm4, xmm2
    cmp		eax, 0xf
    vmovaps	xmm0, xmm2
    vmulps	xmm2, xmm2, xmm4
    vmulps	xmm2, xmm2, xmm4
    vaddps	xmm4, xmm4, xmm4
    vsubps	xmm4, xmm4, xmm2
    jne		l_pole

    vxorps	xmm4, xmm4, xmm1

    vandps	xmm0, xmm0, xmm5
    vandnps	xmm5, xmm5, xmm4
    vorps	xmm0, xmm0, xmm5

    ret	

    l_pole:
    vmovaps	xmm7, xmm1
    vmovaps	xmm3, oword [_ps_tan_poleval]
    vandps	xmm1, xmm1, xmm0
    vorps	xmm3, xmm3, xmm1
    vandps	xmm4, xmm4, xmm6
    vandnps	xmm6, xmm6, xmm3
    vorps	xmm4, xmm4, xmm6

    vxorps	xmm4, xmm4, xmm7

    vandps	xmm0, xmm0, xmm5
    vandnps	xmm5, xmm5, xmm4
    vorps	xmm0, xmm0, xmm5

    ret	


    """

    assembler = Tdasm()
    chosen_source = avx_source if proc.AVX else sse_source
    return assembler.assemble(chosen_source, True)
示例#43
0
        _endblt:
        ret 
    """
    return code


def _blt_rgba_to_prgba_code():
    """Return the blit assembly source matching the interpreter's word size.

    ``platform.architecture()[0]`` is compared against ``'64bit'``; on a
    match the 64-bit generator is called, otherwise the 32-bit one.
    """
    use_64bit = platform.architecture()[0] == '64bit'
    generator = _blt_floatrgba_code64 if use_64bit else _blt_floatrgba_code32
    return generator()


# Import-time setup: assemble the blit source once and load the machine code
# into a module-wide runtime under the name "blt_rgba_to_prgba".
_asm = Tdasm()
_mc = _asm.assemble(_blt_rgba_to_prgba_code())
_runtime = Runtime()
_data_section = _runtime.load("blt_rgba_to_prgba", _mc)


def blt_rgba_to_prgba(src, dest):
    """Blit an ``ImageRGBA`` source into an ``ImagePRGBA`` destination.

    Both arguments are type-checked with ``assert`` (so the checks vanish
    under ``python -O``).
    """

    assert isinstance(src, ImageRGBA)
    assert isinstance(dest, ImagePRGBA)

    #TODO blt only part of image
    # address_info() yields a pair per image; the names suggest
    # (base address, pitch) -- TODO confirm against the image classes.
    sa, spitch = src.address_info()
    da, dpitch = dest.address_info()
    # Source and destination offsets all start at zero.
    dx = dy = sx = sy = 0
    sw, sh = src.size()
示例#44
0
文件: powps.py 项目: mario007/renmas
def pow_ps():
    """Assemble the packed ``fast_pow_ps`` routine and return its machine code.

    The generated routine takes one operand in xmm0 and a second in xmm1
    (xmm1 is only read, as the multiplier applied between the logarithm and
    exponential stages -- i.e. presumably base in xmm0 and exponent in
    xmm1), and leaves the result in xmm0.

    A per-lane mask of ``0 < xmm0`` is computed first and spilled to a
    16-byte slot at ``[aligned_sp - 16]``, where ``aligned_sp`` is a copy of
    the stack pointer with its low four bits cleared; the mask is reloaded
    near the end and ANDed into the result.  Only the spill-related
    instructions depend on the interpreter's word size
    (``platform.architecture()``): rax/rsp for ``'64bit'``, eax/esp
    otherwise.

    An SSE and an AVX source share one data section; ``proc.AVX`` decides
    which is handed to ``Tdasm.assemble``.
    """
    bits = platform.architecture()[0]

    data = """
    #DATA

    uint32 _epi32_1[4] = 1, 1, 1, 1
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0

    uint32 _ps_am_inv_mant_mask[4] = 0x807FFFFF,  0x807FFFFF,  0x807FFFFF,  0x807FFFFF 
    uint32 _ps_am_min_norm_pos[4] = 0x00800000, 0x00800000, 0x00800000, 0x00800000
    uint32 _epi32_0x7f[4] = 0x7F, 0x7F, 0x7F, 0x7F

    float _ps_log_p0[4] = -0.7895802788, -0.7895802788, -0.7895802788, -0.7895802788
    float _ps_log_q0[4] = -35.6722798256, -35.6722798256, -35.6722798256, -35.6722798256
    float _ps_log_p1[4] = 16.3866645699,  16.3866645699,  16.3866645699,  16.3866645699
    float _ps_log_q1[4] = 312.093766372,  312.093766372,  312.093766372,  312.093766372
    float _ps_log_p2[4] = -64.14099529,  -64.14099529,  -64.14099529,  -64.14099529
    float _ps_log_q2[4] = -769.691943550,  -769.691943550,  -769.691943550,  -769.691943550

    float _ps_log2_c0[4] = 1.442695040,  1.442695040,  1.442695040,  1.442695040
    float _ps_exp2_hi[4] = 127.4999961,  127.4999961,  127.4999961,  127.4999961
    float _ps_exp2_lo[4] = -127.4999961,  -127.4999961,  -127.4999961,  -127.4999961
    float _ps_am_0p5[4] = 0.5, 0.5, 0.5, 0.5
    float _ps_exp2_p0[4] = 0.0230933477, 0.0230933477, 0.0230933477, 0.0230933477
    float _ps_exp2_q0[4] = 233.18421172, 233.18421172, 233.18421172, 233.18421172 
    float _ps_exp2_p1[4] = 20.202065669,  20.202065669,  20.202065669,  20.202065669
    float _ps_exp2_q1[4] = 4368.211668, 4368.211668, 4368.211668, 4368.211668
    float _ps_exp2_p2[4] = 1513.90680, 1513.90680, 1513.90680, 1513.90680

    """

    # Word-size dependent pieces, resolved once and shared by both encodings.
    if bits == '64bit':
        scratch, stack_ptr = 'rax', 'rsp'
    else:
        scratch, stack_ptr = 'eax', 'esp'
    copy_sp = "mov " + scratch + ", " + stack_ptr + "\n"
    align_scratch = "and " + scratch + ", 0xFFFFFFF0\n"
    spill_slot = "oword [" + scratch + " - 16]"
    sse_spill = "movaps " + spill_slot + ", xmm5\n"
    sse_reload = "movaps xmm5, " + spill_slot + "\n"
    avx_spill = "v" + sse_spill
    avx_reload = "v" + sse_reload

    # Legacy SSE encoding.
    sse_source = "".join((
        data,
        """

    #CODE
    global fast_pow_ps:
	xorps	xmm5, xmm5
	cmpps xmm5, xmm0, 1
    """,
        copy_sp,
        """
	maxps	xmm0, oword [ _ps_am_min_norm_pos]  ;// cut off denormalized stuff
	movaps	xmm7, oword [_ps_am_1]
	movaps	xmm3, xmm0
    """,
        align_scratch,
        """
	andps	xmm0, oword [_ps_am_inv_mant_mask]
	orps	xmm0, xmm7
    """,
        sse_spill,
        """
	movaps	xmm4, xmm0
	subps	xmm0, xmm7
	addps	xmm4, xmm7
	psrld	xmm3, 23
	rcpps	xmm4, xmm4
	mulps	xmm0, xmm4
	psubd	xmm3, oword [_epi32_0x7f]
	addps	xmm0, xmm0

	movaps	xmm2, xmm0
	mulps	xmm0, xmm0

	movaps	xmm4, oword [_ps_log_p0]
	movaps	xmm6, oword [_ps_log_q0]

	mulps	xmm4, xmm0
	movaps	xmm5, oword [_ps_log_p1]
	mulps	xmm6, xmm0
	movaps	xmm7, oword [_ps_log_q1]

	addps	xmm4, xmm5
	addps	xmm6, xmm7

	movaps	xmm5, oword [_ps_log_p2]
	mulps	xmm4, xmm0
	movaps	xmm7, oword [_ps_log_q2]
	mulps	xmm6, xmm0

	addps	xmm4, xmm5
	movaps	xmm5, oword [_ps_log2_c0]
	addps	xmm6, xmm7
	cvtdq2ps	xmm7, xmm3

	mulps	xmm0, xmm4
	rcpps	xmm6, xmm6

	mulps	xmm0, xmm6
	movaps	xmm4, oword [_ps_exp2_hi]
	mulps	xmm0, xmm2
	movaps	xmm6, oword [_ps_exp2_lo]
	mulps	xmm2, xmm5
	mulps	xmm0, xmm5
	addps	xmm2, xmm7
	movaps	xmm3, oword [_ps_am_0p5]
	addps	xmm0, xmm2
	xorps	xmm2, xmm2

	mulps	xmm0, xmm1

	minps	xmm0, xmm4
	movaps	xmm4, oword [_ps_exp2_p0]
	maxps	xmm0, xmm6
	movaps	xmm6, oword [_ps_exp2_q0]

	addps	xmm3, xmm0

	cmpps xmm2, xmm3, 5
	pand	xmm2, oword [_epi32_1]

	cvttps2dq	xmm3, xmm3

	psubd	xmm3, xmm2
	movaps	xmm5, oword [_ps_exp2_p1]

	cvtdq2ps	xmm2, xmm3
	movaps	xmm7, oword [_ps_exp2_q1]

	subps	xmm0, xmm2

	movaps	xmm2, xmm0
	mulps	xmm0, xmm0

	paddd	xmm3, oword [_epi32_0x7f]

	mulps	xmm4, xmm0
	mulps	xmm6, xmm0
	addps	xmm4, xmm5
	addps	xmm6, xmm7

	mulps	xmm4, xmm0
    """,
        sse_reload,
        """
	pslld	xmm3, 23
	addps	xmm4, oword [_ps_exp2_p2]

	mulps	xmm2, xmm4

	movaps	xmm0, oword [_ps_am_1]
	subps	xmm6, xmm2
	andps	xmm3, xmm5
	rcpps	xmm6, xmm6
	mulps	xmm2, xmm6
	addps	xmm2, xmm2
	addps	xmm0, xmm2

	mulps	xmm0, xmm3
    ret
    """,
    ))

    # VEX-encoded AVX counterpart.
    avx_source = "".join((
        data,
        """

    #CODE
    global fast_pow_ps:
	vxorps	xmm5, xmm5, xmm5
	vcmpps xmm5, xmm5, xmm0, 1
    """,
        copy_sp,
        """
	vmaxps	xmm0, xmm0, oword [ _ps_am_min_norm_pos]  ;// cut off denormalized stuff
	vmovaps	xmm7, oword [_ps_am_1]
	vmovaps	xmm3, xmm0
    """,
        align_scratch,
        """
	vandps	xmm0, xmm0, oword [_ps_am_inv_mant_mask]
	vorps	xmm0, xmm0, xmm7
    """,
        avx_spill,
        """
	vmovaps	xmm4, xmm0
	vsubps	xmm0, xmm0, xmm7
	vaddps	xmm4, xmm4, xmm7
	vpsrld	xmm3, xmm3, 23
	vrcpps	xmm4, xmm4
	vmulps	xmm0, xmm0, xmm4
	vpsubd	xmm3, xmm3, oword [_epi32_0x7f]
	vaddps	xmm0, xmm0, xmm0

	vmovaps	xmm2, xmm0
	vmulps	xmm0, xmm0, xmm0

	vmovaps	xmm4, oword [_ps_log_p0]
	vmovaps	xmm6, oword [_ps_log_q0]

	vmulps	xmm4, xmm4, xmm0
	vmovaps	xmm5, oword [_ps_log_p1]
	vmulps	xmm6, xmm6, xmm0
	vmovaps	xmm7, oword [_ps_log_q1]

	vaddps	xmm4, xmm4, xmm5
	vaddps	xmm6, xmm6, xmm7

	vmovaps	xmm5, oword [_ps_log_p2]
	vmulps	xmm4, xmm4, xmm0
	vmovaps	xmm7, oword [_ps_log_q2]
	vmulps	xmm6, xmm6, xmm0

	vaddps	xmm4, xmm4, xmm5
	vmovaps	xmm5, oword [_ps_log2_c0]
	vaddps	xmm6, xmm6, xmm7
	vcvtdq2ps	xmm7, xmm3

	vmulps	xmm0, xmm0, xmm4
	vrcpps	xmm6, xmm6

	vmulps	xmm0, xmm0, xmm6
	vmovaps	xmm4, oword [_ps_exp2_hi]
	vmulps	xmm0, xmm0, xmm2
	vmovaps	xmm6, oword [_ps_exp2_lo]
	vmulps	xmm2, xmm2, xmm5
	vmulps	xmm0, xmm0, xmm5
	vaddps	xmm2, xmm2, xmm7
	vmovaps	xmm3, oword [_ps_am_0p5]
	vaddps	xmm0, xmm0, xmm2
	vxorps	xmm2, xmm2, xmm2

	vmulps	xmm0, xmm0, xmm1

	vminps	xmm0, xmm0, xmm4
	vmovaps	xmm4, oword [_ps_exp2_p0]
	vmaxps	xmm0, xmm0, xmm6
	vmovaps	xmm6, oword [_ps_exp2_q0]

	vaddps	xmm3, xmm3, xmm0

	vcmpps xmm2, xmm2, xmm3, 5
	vpand	xmm2, xmm2, oword [_epi32_1]

	vcvttps2dq	xmm3, xmm3

	vpsubd	xmm3, xmm3, xmm2
	vmovaps	xmm5, oword [_ps_exp2_p1]

	vcvtdq2ps	xmm2, xmm3
	vmovaps	xmm7, oword [_ps_exp2_q1]

	vsubps	xmm0, xmm0, xmm2

	vmovaps	xmm2, xmm0
	vmulps	xmm0, xmm0, xmm0

	vpaddd	xmm3, xmm3, oword [_epi32_0x7f]

	vmulps	xmm4, xmm4, xmm0
	vmulps	xmm6, xmm6, xmm0
	vaddps	xmm4, xmm4, xmm5
	vaddps	xmm6, xmm6, xmm7

	vmulps	xmm4, xmm4, xmm0
    """,
        avx_reload,
        """
	vpslld	xmm3, xmm3, 23
	vaddps	xmm4, xmm4, oword [_ps_exp2_p2]

	vmulps	xmm2, xmm2, xmm4

	vmovaps	xmm0, oword [_ps_am_1]
	vsubps	xmm6, xmm6, xmm2
	vandps	xmm3, xmm3, xmm5
	vrcpps	xmm6, xmm6
	vmulps	xmm2, xmm2, xmm6
	vaddps	xmm2, xmm2, xmm2
	vaddps	xmm0, xmm0, xmm2

	vmulps	xmm0, xmm0, xmm3
    ret
    """,
    ))

    assembler = Tdasm()
    chosen_source = avx_source if proc.AVX else sse_source
    return assembler.assemble(chosen_source, True)
示例#45
0
def random_float():
    """Assemble the ``random`` routine, a four-lane pseudo-random float generator.

    The generated routine keeps its state in ``cur_seed`` (four uint32 lanes).
    Each call advances every lane as ``seed * mult + gadd`` truncated to 32
    bits, stores the new state back to ``cur_seed``, clears the sign bit,
    converts the lanes to float and scales them by ``_random_flt``
    (about 2**-31).  The four results are left in ``xmm0``.

    ``pmuludq`` multiplies only the even dword lanes, so the odd lanes are
    processed by swapping adjacent dwords (``pshufd ..., 10110001b``),
    multiplying again, masking to the low 32 bits and swapping back before
    both halves are OR-ed together.

    Two equivalent sources are built -- legacy SSE and VEX-encoded AVX -- and
    the one matching ``proc.AVX`` is assembled.

    Returns:
        The object produced by ``Tdasm.assemble`` for the selected source.
    """
    # Shared data section.  ``cur_seed`` has no initializer here (presumably
    # the caller seeds it through the loaded data section -- TODO confirm).
    # ``masklo`` is declared but not referenced by either code variant.
    data = """
            #DATA
            uint32 cur_seed[4]
            uint32 mult[4] = 214013, 17405, 214013, 69069
            uint32 gadd[4] = 2531011, 10395331, 13737667, 1
            uint32 mask[4] = 0xFFFFFFFF, 0, 0xFFFFFFFF, 0
            uint32 masklo[4] = 0x00007FFF, 0x00007FFF, 0x00007FFF, 0x00007FFF
            uint32 _random_sign_mask[4] = 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF
            float _random_flt[4] = 0.000000000465661287524, 0.000000000465661287524, 0.000000000465661287524, 0.000000000465661287524

        """

    # Legacy SSE encoding of the routine.
    asm_code = data + """
        #CODE
        global random:
        movdqa xmm0, oword [gadd]
        movdqa xmm1, oword [mult]
        movdqa xmm2, oword [mask]


        pshufd xmm4, oword [cur_seed], 10110001b
        movdqa xmm5, oword [cur_seed]
        pmuludq xmm5, xmm1
        pshufd xmm1, xmm1, 10110001b
        pmuludq xmm4, xmm1
        pand xmm5, xmm2
        pand xmm4, xmm2
        pshufd xmm4, xmm4, 10110001b
        por xmm5, xmm4
        paddd xmm5, xmm0

        movdqa oword [cur_seed], xmm5

        ;convert to float
        pand xmm5, oword [_random_sign_mask]
        cvtdq2ps xmm0, xmm5
        mulps xmm0, oword [_random_flt]

        ret

    """

    # Same instruction sequence in three-operand VEX (AVX) form.
    avx_code = data + """
        #CODE
        global random:
        vmovdqa xmm0, oword [gadd]
        vmovdqa xmm1, oword [mult]
        vmovdqa xmm2, oword [mask]


        vpshufd xmm4, oword [cur_seed], 10110001b
        vmovdqa xmm5, oword [cur_seed]
        vpmuludq xmm5, xmm5, xmm1
        vpshufd xmm1, xmm1, 10110001b
        vpmuludq xmm4, xmm4, xmm1
        vpand xmm5, xmm5, xmm2
        vpand xmm4, xmm4, xmm2
        vpshufd xmm4, xmm4, 10110001b
        vpor xmm5, xmm5, xmm4
        vpaddd xmm5, xmm5, xmm0

        vmovdqa oword [cur_seed], xmm5

        ;convert to float
        vpand xmm5, xmm5, oword [_random_sign_mask]
        vcvtdq2ps xmm0, xmm5
        vmulps xmm0, xmm0, oword [_random_flt]
        
        ret

    """

    # NOTE(review): the second assemble() argument presumably marks the source
    # as a callable routine rather than a standalone program -- confirm
    # against Tdasm.assemble.
    asm = Tdasm()
    if proc.AVX:
        mc = asm.assemble(avx_code, True)
    else:
        mc = asm.assemble(asm_code, True)

    return mc
示例#46
0
def log_ss():
    """Assemble ``fast_log_ss``, a scalar single-precision natural logarithm.

    The generated routine takes its argument in the low lane of ``xmm0`` and
    returns the result there.  It uses ``xmm1``, ``xmm2``, ``xmm4``-``xmm7``
    and ``edx`` as scratch.

    Outline of the computation:

    * the input is clamped to the smallest normalized positive float;
    * the biased exponent is extracted into ``edx`` (``shr 23``, ``sub 0x7f``)
      and the mantissa is forced into [1, 2) by masking and OR-ing with 1.0;
    * ``z = 2 * (m - 1) / (m + 1)`` is formed with an ``rcpss`` reciprocal;
    * the result is ``z + z * z^2 * P(z^2) / Q(z^2) + exponent * ln(2)``,
      where P and Q are the quadratic polynomials built from the
      ``_ps_log_p*`` / ``_ps_log_q*`` constants.

    Two equivalent sources are built -- legacy SSE and VEX-encoded AVX -- and
    the one matching ``proc.AVX`` is assembled.

    Returns:
        The object produced by ``Tdasm.assemble`` for the selected source.
    """
    data = """
    #DATA
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0
    uint32 _ps_am_min_norm_pos[4] = 0x00800000, 0x00800000, 0x00800000, 0x00800000
    uint32 _ps_am_inv_mant_mask[4] = 0x807FFFFF, 0x807FFFFF, 0x807FFFFF, 0x807FFFFF 
    float _ps_log_p0[4] = -0.789580278884, -0.789580278884, -0.789580278884, -0.789580278884
    float _ps_log_q0[4] = -35.6722798256, -35.6722798256, -35.6722798256, -35.6722798256
    float _ps_log_p1[4] = 16.38666456995, 16.38666456995, 16.38666456995, 16.38666456995
    float _ps_log_q1[4] = 312.0937663722, 312.0937663722, 312.0937663722, 312.0937663722
    float _ps_log_p2[4] = -64.14099529587, -64.14099529587, -64.14099529587, -64.14099529587
    float _ps_log_q2[4] = -769.69194355046, -769.69194355046, -769.69194355046, -769.69194355046
    float _ps_log_c0[4] = 0.6931471805599, 0.6931471805599, 0.6931471805599, 0.6931471805599 

    """
    # Legacy SSE encoding of the routine.
    asm_code = data + """

    #CODE
    global fast_log_ss:
    maxss	xmm0, dword [_ps_am_min_norm_pos]  ; cut off denormalized stuff
    movss	xmm1, dword [_ps_am_1]
    movd	edx, xmm0

    andps	xmm0, oword [_ps_am_inv_mant_mask]
    orps	xmm0, xmm1

    movaps	xmm4, xmm0
    subss	xmm0, xmm1
    addss	xmm4, xmm1
    shr		edx, 23
    rcpss	xmm4, xmm4
    mulss	xmm0, xmm4
    addss	xmm0, xmm0

    movaps	xmm2, xmm0
    mulss	xmm0, xmm0
    sub		edx, 0x7f

    movss	xmm4, dword [_ps_log_p0]
    movss	xmm6, dword [_ps_log_q0]

    mulss	xmm4, xmm0
    movss	xmm5, dword [_ps_log_p1]
    mulss	xmm6, xmm0
    movss	xmm7, dword [_ps_log_q1]

    addss	xmm4, xmm5
    addss	xmm6, xmm7

    movss	xmm5, dword [_ps_log_p2]
    mulss	xmm4, xmm0
    movss	xmm7, dword [_ps_log_q2]
    mulss	xmm6, xmm0

    addss	xmm4, xmm5
    movss	xmm5, dword [_ps_log_c0]
    addss	xmm6, xmm7
    cvtsi2ss	xmm1, edx

    mulss	xmm0, xmm4
    rcpss	xmm6, xmm6

    mulss	xmm0, xmm6
    mulss	xmm0, xmm2

    mulss	xmm1, xmm5

    addss	xmm0, xmm2
    addss	xmm0, xmm1

    ret	


    """

    # VEX (AVX) encoding of the same sequence.  The copy of the float bits
    # into edx uses vmovd so that no legacy-encoded SSE instruction is mixed
    # into the VEX code path; the value moved is the same.
    avx_code = data + """

    #CODE
    global fast_log_ss:
    vmaxss	xmm0, xmm0, dword [_ps_am_min_norm_pos]  ; cut off denormalized stuff
    vmovss	xmm1, dword [_ps_am_1]
    vmovd	edx, xmm0

    vandps	xmm0, xmm0, oword [_ps_am_inv_mant_mask]
    vorps	xmm0, xmm0, xmm1

    vmovaps	xmm4, xmm0
    vsubss	xmm0, xmm0, xmm1
    vaddss	xmm4, xmm4, xmm1
    shr		edx, 23
    vrcpss	xmm4, xmm4, xmm4
    vmulss	xmm0, xmm0, xmm4
    vaddss	xmm0, xmm0, xmm0

    vmovaps	xmm2, xmm0
    vmulss	xmm0, xmm0, xmm0
    sub		edx, 0x7f

    vmovss	xmm4, dword [_ps_log_p0]
    vmovss	xmm6, dword [_ps_log_q0]

    vmulss	xmm4, xmm4, xmm0
    vmovss	xmm5, dword [_ps_log_p1]
    vmulss	xmm6, xmm6, xmm0
    vmovss	xmm7, dword [_ps_log_q1]

    vaddss	xmm4, xmm4, xmm5
    vaddss	xmm6, xmm6, xmm7

    vmovss	xmm5, dword [_ps_log_p2]
    vmulss	xmm4, xmm4, xmm0
    vmovss	xmm7, dword [_ps_log_q2]
    vmulss	xmm6, xmm6, xmm0

    vaddss	xmm4, xmm4, xmm5
    vmovss	xmm5, dword [_ps_log_c0]
    vaddss	xmm6, xmm6, xmm7
    vcvtsi2ss	xmm1, xmm1, edx

    vmulss	xmm0, xmm0, xmm4
    vrcpss	xmm6, xmm6, xmm6

    vmulss	xmm0, xmm0, xmm6
    vmulss	xmm0, xmm0, xmm2

    vmulss	xmm1, xmm1, xmm5

    vaddss	xmm0, xmm0, xmm2
    vaddss	xmm0, xmm0, xmm1

    ret	


    """

    # NOTE(review): the second assemble() argument presumably marks the source
    # as a callable routine rather than a standalone program -- confirm
    # against Tdasm.assemble.
    asm = Tdasm()
    if proc.AVX:
        mc = asm.assemble(avx_code, True)
    else:
        mc = asm.assemble(asm_code, True)

    return mc
示例#47
0
def sincos_ss():
    """Assemble ``fast_sincos_ss``, a scalar routine producing sine and cosine.

    The generated routine takes its argument in the low lane of ``xmm0`` and
    evaluates two polynomial chains side by side.  The chain left in ``xmm0``
    has the input's sign folded into it (the sine); the chain left in ``xmm6``
    takes its sign from the quadrant only (the cosine).  The commented-out
    stores at the end of the assembly text name the same two registers as the
    outputs.  All of ``xmm0``-``xmm7`` are used.

    Outline of the computation:

    * sign and magnitude of the input are separated;
    * the magnitude is multiplied by 2/pi and truncated to an integer
      quadrant index, and the fractional part ``f`` is taken (clamped to 1.0);
    * bit 0 of the quadrant selects ``f`` or ``1 - f`` for each output, while
      bit 1 of the quadrant (and of quadrant + 1) is shifted into the sign
      position;
    * each output is ``z * (((p3*z^2 + p2)*z^2 + p1)*z^2 + p0)`` with the
      sign bit OR-ed into ``z``.

    Two equivalent sources are built -- legacy SSE and VEX-encoded AVX -- and
    the one matching ``proc.AVX`` is assembled.

    Returns:
        The object produced by ``Tdasm.assemble`` for the selected source.
    """
    # Shared data section.  ``_ps_am_pi_o_2`` is declared but not referenced
    # by either code variant.
    data = """
    #DATA
    uint32 _ps_am_inv_sign_mask[4] = 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF
    uint32 _ps_am_sign_mask[4] = 0x80000000, 0x80000000, 0x80000000, 0x80000000
    float _ps_am_pi_o_2[4] = 1.57079632679, 1.57079632679, 1.57079632679, 1.57079632679
    float _ps_am_2_o_pi[4] = 0.63661977236, 0.63661977236, 0.63661977236, 0.63661977236
    uint32 _epi32_1[4] = 1, 1, 1, 1
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0
    uint32 _epi32_2[4] = 2, 2, 2, 2

    float _ps_sincos_p3[4] = -0.00468175413, -0.00468175413, -0.00468175413, -0.00468175413
    float _ps_sincos_p2[4] = 0.0796926262, 0.0796926262, 0.0796926262, 0.0796926262
    float _ps_sincos_p1[4] = -0.64596409750621,-0.64596409750621,-0.64596409750621,-0.64596409750621 
    float _ps_sincos_p0[4] = 1.570796326794896, 1.570796326794896, 1.570796326794896, 1.570796326794896
    """
    # Legacy SSE encoding of the routine.
    asm_code = data + """
    #CODE
    global fast_sincos_ss:
    movaps	xmm7, xmm0
    movss	xmm1, dword [_ps_am_inv_sign_mask]
    movss	xmm2, dword [_ps_am_sign_mask]
    movss	xmm3, dword [_ps_am_2_o_pi]
    andps	xmm0, xmm1
    andps	xmm7, xmm2
    mulss	xmm0, xmm3

    pxor	xmm3, xmm3
    movd	xmm5, dword [_epi32_1]
    movss	xmm4, dword [_ps_am_1]

    cvttps2dq	xmm2, xmm0
    pand	xmm5, xmm2
    movd	xmm1, dword [_epi32_2]
    pcmpeqd	xmm5, xmm3
    movd	xmm3, dword [_epi32_1]
    cvtdq2ps	xmm6, xmm2
    paddd	xmm3, xmm2
    pand	xmm2, xmm1
    pand	xmm3, xmm1
    subss	xmm0, xmm6
    pslld	xmm2, 30
    minss	xmm0, xmm4
    ;mov		eax, [esp + 4 + 16]
    ;mov		edx, [esp + 4 + 16 + 4]
    subss	xmm4, xmm0
    pslld	xmm3, 30

    movaps	xmm6, xmm4
    xorps	xmm2, xmm7
    movaps	xmm7, xmm5
    andps	xmm6, xmm7
    andnps	xmm7, xmm0
    andps	xmm0, xmm5
    andnps	xmm5, xmm4
    movss	xmm4, dword [_ps_sincos_p3]
    orps	xmm6, xmm7
    orps	xmm0, xmm5
    movss	xmm5, dword [_ps_sincos_p2]

    movaps	xmm1, xmm0
    movaps	xmm7, xmm6
    mulss	xmm0, xmm0
    mulss	xmm6, xmm6
    orps	xmm1, xmm2
    orps	xmm7, xmm3
    movaps	xmm2, xmm0
    movaps	xmm3, xmm6
    mulss	xmm0, xmm4
    mulss	xmm6, xmm4
    movss	xmm4, dword [_ps_sincos_p1]
    addss	xmm0, xmm5
    addss	xmm6, xmm5
    movss	xmm5, dword [_ps_sincos_p0]
    mulss	xmm0, xmm2
    mulss	xmm6, xmm3
    addss	xmm0, xmm4
    addss	xmm6, xmm4
    mulss	xmm0, xmm2
    mulss	xmm6, xmm3
    addss	xmm0, xmm5
    addss	xmm6, xmm5
    mulss	xmm0, xmm1
    mulss	xmm6, xmm7

    ;use full stores since caller might reload with full loads
    ;movaps	[eax], xmm0
    ;movaps	[edx], xmm6

    ret	
    """

    # Same instruction sequence in three-operand VEX (AVX) form.
    avx_code = data + """
    #CODE
    global fast_sincos_ss:
    vmovaps	xmm7, xmm0
    vmovss	xmm1, dword [_ps_am_inv_sign_mask]
    vmovss	xmm2, dword [_ps_am_sign_mask]
    vmovss	xmm3, dword [_ps_am_2_o_pi]
    vandps	xmm0, xmm0, xmm1
    vandps	xmm7, xmm7, xmm2
    vmulss	xmm0, xmm0, xmm3

    vpxor	xmm3, xmm3, xmm3
    vmovd	xmm5, dword [_epi32_1]
    vmovss	xmm4, dword [_ps_am_1]

    vcvttps2dq	xmm2, xmm0
    vpand	xmm5, xmm5, xmm2
    vmovd	xmm1, dword [_epi32_2]
    vpcmpeqd	xmm5, xmm5, xmm3
    vmovd	xmm3, dword [_epi32_1]
    vcvtdq2ps	xmm6, xmm2
    vpaddd	xmm3, xmm3, xmm2
    vpand	xmm2, xmm2, xmm1
    vpand	xmm3, xmm3, xmm1
    vsubss	xmm0, xmm0, xmm6
    vpslld	xmm2, xmm2, 30
    vminss	xmm0, xmm0, xmm4
    ;mov		eax, [esp + 4 + 16]
    ;mov		edx, [esp + 4 + 16 + 4]
    vsubss	xmm4, xmm4, xmm0
    vpslld	xmm3, xmm3, 30

    vmovaps	xmm6, xmm4
    vxorps	xmm2, xmm2, xmm7
    vmovaps	xmm7, xmm5
    vandps	xmm6, xmm6, xmm7
    vandnps	xmm7, xmm7, xmm0
    vandps	xmm0, xmm0, xmm5
    vandnps	xmm5, xmm5, xmm4
    vmovss	xmm4, dword [_ps_sincos_p3]
    vorps	xmm6, xmm6, xmm7
    vorps	xmm0, xmm0, xmm5
    vmovss	xmm5, dword [_ps_sincos_p2]

    vmovaps	xmm1, xmm0
    vmovaps	xmm7, xmm6
    vmulss	xmm0, xmm0, xmm0
    vmulss	xmm6, xmm6, xmm6
    vorps	xmm1, xmm1, xmm2
    vorps	xmm7, xmm7, xmm3
    vmovaps	xmm2, xmm0
    vmovaps	xmm3, xmm6
    vmulss	xmm0, xmm0, xmm4
    vmulss	xmm6, xmm6, xmm4
    vmovss	xmm4, dword [_ps_sincos_p1]
    vaddss	xmm0, xmm0, xmm5
    vaddss	xmm6, xmm6, xmm5
    vmovss	xmm5, dword [_ps_sincos_p0]
    vmulss	xmm0, xmm0, xmm2
    vmulss	xmm6, xmm6, xmm3
    vaddss	xmm0, xmm0, xmm4
    vaddss	xmm6, xmm6, xmm4
    vmulss	xmm0, xmm0, xmm2
    vmulss	xmm6, xmm6, xmm3
    vaddss	xmm0, xmm0, xmm5
    vaddss	xmm6, xmm6, xmm5
    vmulss	xmm0, xmm0, xmm1
    vmulss	xmm6, xmm6, xmm7

    ;use full stores since caller might reload with full loads
    ;movaps	[eax], xmm0
    ;movaps	[edx], xmm6

    ret	
    """

    # NOTE(review): the second assemble() argument presumably marks the source
    # as a callable routine rather than a standalone program -- confirm
    # against Tdasm.assemble.
    asm = Tdasm()
    if proc.AVX:
        mc = asm.assemble(avx_code, True)
    else:
        mc = asm.assemble(asm_code, True)

    return mc
示例#48
0
def asin_ps():
    """Assemble ``fast_asin_ps``, a packed (four-lane) arcsine approximation.

    The generated routine takes four floats in ``xmm0`` and returns four
    results in ``xmm0``; all of ``xmm0``-``xmm7`` are used.

    Outline of the computation:

    * the input is first turned into ``x * rsqrt((1 + x) * (1 - x))``, i.e.
      ``x / sqrt(1 - x^2)``;
    * the section marked ``;atan`` then evaluates an arctangent of that value.
      Lanes with magnitude above 1 (``x > 1`` or ``x <= -1``) are replaced
      by their ``rcpps`` reciprocal, and for those lanes the result is
      rebuilt as ``(+/- pi/2) - value`` by the final select;
    * the arctangent core is a chain of nested reciprocals built from the
      ``_ps_atan_s*`` / ``_ps_atan_t*`` constants.

    Two equivalent sources are built -- legacy SSE and VEX-encoded AVX -- and
    the one matching ``proc.AVX`` is assembled.

    Returns:
        The object produced by ``Tdasm.assemble`` for the selected source.
    """
    data = """
    #DATA
    uint32 _ps_am_sign_mask[4] = 0x80000000, 0x80000000, 0x80000000, 0x80000000
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0
    float _ps_am_m1[4] = -1.0, -1.0, -1.0, -1.0
    float _ps_atan_t0[4] = -0.091646118527, -0.091646118527, -0.091646118527, -0.091646118527
    float _ps_atan_s0[4] = 1.2797564625, 1.2797564625, 1.2797564625, 1.2797564625
    float _ps_atan_s1[4] = 2.1972168858, 2.1972168858, 2.1972168858, 2.1972168858
    float _ps_atan_t1[4] = -1.395694568, -1.395694568, -1.395694568, -1.395694568
    float _ps_atan_s2[4] = 6.8193064723, 6.8193064723, 6.8193064723 ,6.8193064723
    float _ps_atan_t2[4] = -94.3939261227, -94.3939261227, -94.3939261227, -94.3939261227
    float _ps_atan_s3[4] = 28.205206687, 28.205206687, 28.205206687, 28.205206687
    float _ps_atan_t3[4] = 12.888383034, 12.888383034, 12.888383034, 12.888383034
    float _ps_am_pi_o_2[4] = 1.57079632679, 1.57079632679, 1.57079632679, 1.57079632679

    """
    # Legacy SSE encoding of the routine.
    asm_code = data + """

    #CODE
    global fast_asin_ps:
    movaps xmm1, oword [_ps_am_1]
    movaps xmm2, xmm1
    addps xmm1, xmm0
    subps xmm2, xmm0
    mulps xmm1, xmm2
    rsqrtps xmm1, xmm1
    mulps xmm0, xmm1

    ;atan
    movaps	xmm5, oword [_ps_am_1]
	movaps	xmm6, oword [_ps_am_m1]
	rcpps	xmm4, xmm0

	cmpps	xmm5, xmm0, 1
	cmpps	xmm6, xmm0, 6
	movaps	xmm1, oword [_ps_atan_s0]
	orps	xmm5, xmm6

	andps	xmm4, xmm5
	movaps	xmm2, oword [_ps_atan_t0]
	movaps	xmm7, xmm5
	andnps	xmm5, xmm0
	movaps	xmm3, oword [_ps_atan_s1]
	orps	xmm4, xmm5
	movaps	xmm0, xmm4

	movaps	xmm6, oword [_ps_atan_t1]
	mulps	xmm4, xmm4

	addps	xmm1, xmm4
	movaps	xmm5, oword [_ps_atan_s2]
	rcpps	xmm1, xmm1
	mulps	xmm1, xmm2
	movaps	xmm2, oword [_ps_atan_t2]
	addps	xmm3, xmm4
	addps	xmm1, xmm3

	movaps	xmm3, oword [_ps_atan_s3]
	rcpps	xmm1, xmm1
	mulps	xmm1, xmm6
	movaps	xmm6, oword [_ps_atan_t3]
	addps	xmm5, xmm4
	addps	xmm1, xmm5

	movaps	xmm5, oword [_ps_am_sign_mask]
	rcpps	xmm1, xmm1
	mulps	xmm1, xmm2
	addps	xmm3, xmm4
	movaps	xmm4, oword [_ps_am_pi_o_2]
	mulps	xmm6, xmm0
	addps	xmm1, xmm3

	andps	xmm0, xmm5
	rcpps	xmm1, xmm1
	mulps	xmm1, xmm6

	orps	xmm0, xmm4
	subps	xmm0, xmm1

	andps	xmm0, xmm7
	andnps	xmm7, xmm1
	orps	xmm0, xmm7
	ret

    """

    # Same instruction sequence in three-operand VEX (AVX) form.
    avx_code = data + """

    #CODE
    global fast_asin_ps:
    vmovaps xmm1, oword [_ps_am_1]
    vmovaps xmm2, xmm1
    vaddps xmm1, xmm1, xmm0
    vsubps xmm2, xmm2, xmm0
    vmulps xmm1, xmm1, xmm2
    vrsqrtps xmm1, xmm1
    vmulps xmm0, xmm0, xmm1

    ;atan
    vmovaps	xmm5, oword [_ps_am_1]
	vmovaps	xmm6, oword [_ps_am_m1]
	vrcpps	xmm4, xmm0

	vcmpps	xmm5, xmm5, xmm0, 1
	vcmpps	xmm6, xmm6, xmm0, 6
	vmovaps	xmm1, oword [_ps_atan_s0]
	vorps	xmm5, xmm5, xmm6

	vandps	xmm4, xmm4, xmm5
	vmovaps	xmm2, oword [_ps_atan_t0]
	vmovaps	xmm7, xmm5
	vandnps	xmm5, xmm5, xmm0
	vmovaps	xmm3, oword [_ps_atan_s1]
	vorps	xmm4, xmm4, xmm5
	vmovaps	xmm0, xmm4

	vmovaps	xmm6, oword [_ps_atan_t1]
	vmulps	xmm4, xmm4, xmm4

	vaddps	xmm1, xmm1, xmm4
	vmovaps	xmm5, oword [_ps_atan_s2]
	vrcpps	xmm1, xmm1
	vmulps	xmm1, xmm1, xmm2
	vmovaps	xmm2, oword [_ps_atan_t2]
	vaddps	xmm3, xmm3, xmm4
	vaddps	xmm1, xmm1, xmm3

	vmovaps	xmm3, oword [_ps_atan_s3]
	vrcpps	xmm1, xmm1
	vmulps	xmm1, xmm1, xmm6
	vmovaps	xmm6, oword [_ps_atan_t3]
	vaddps	xmm5, xmm5, xmm4
	vaddps	xmm1, xmm1, xmm5

	vmovaps	xmm5, oword [_ps_am_sign_mask]
	vrcpps	xmm1, xmm1
	vmulps	xmm1, xmm1, xmm2
	vaddps	xmm3, xmm3, xmm4
	vmovaps	xmm4, oword [_ps_am_pi_o_2]
	vmulps	xmm6, xmm6, xmm0
	vaddps	xmm1, xmm1, xmm3

	vandps	xmm0, xmm0, xmm5
	vrcpps	xmm1, xmm1
	vmulps	xmm1, xmm1, xmm6

	vorps	xmm0, xmm0, xmm4
	vsubps	xmm0, xmm0, xmm1

	vandps	xmm0, xmm0, xmm7
	vandnps	xmm7, xmm7, xmm1
	vorps	xmm0, xmm0, xmm7
	ret

    """

    # NOTE(review): the second assemble() argument presumably marks the source
    # as a callable routine rather than a standalone program -- confirm
    # against Tdasm.assemble.
    asm = Tdasm()
    if proc.AVX:
        mc = asm.assemble(avx_code, True)
    else:
        mc = asm.assemble(asm_code, True)
    
    return mc
示例#49
0
        add dword [y], 1
        jmp _bltrgba
        _endblt:
        ret 
    """ 
    return code

def _blt_rgba_to_prgba_code():
    """Return the blit assembly source that matches the interpreter word size.

    ``platform.architecture()`` reports the bitness of the running Python
    executable; the 64-bit source builder is used for ``'64bit'`` and the
    32-bit builder for anything else.
    """
    word_size = platform.architecture()[0]
    build_source = (_blt_floatrgba_code64 if word_size == '64bit'
                    else _blt_floatrgba_code32)
    return build_source()

# Import-time setup: assemble the word-size-specific blit source once and load
# it into a dedicated runtime under the name "blt_rgba_to_prgba".
_asm = Tdasm()
_mc = _asm.assemble(_blt_rgba_to_prgba_code())
_runtime = Runtime()
# NOTE(review): the value returned by load() is presumably the handle used to
# pass arguments to the routine -- confirm against blt_rgba_to_prgba.
_data_section = _runtime.load("blt_rgba_to_prgba", _mc)


def blt_rgba_to_prgba(src, dest):

    assert isinstance(src, ImageRGBA)
    assert isinstance(dest, ImagePRGBA)

    #TODO blt only part of image
    sa, spitch = src.address_info() 
    da, dpitch = dest.address_info()
    dx = dy = sx = sy = 0
    sw, sh = src.size()
示例#50
0
    mov ebx, dword [eax + hitpoint.t]

    mov edx, dword [esp + 16] ;populate new minimum distance
    mov dword [edx], ebx
    jmp _next_object
    
    _end_objects:
    add esp, 20 
    ret


"""

# Register the multiple-intersection routine in the runtime under the name
# "multiple_isect" (presumably the routine the ASM test program calls -- the
# ASM text is not fully visible here), then assemble the test program.
asm = Tdasm()
renmas.shapes.multiple_isect_asm(runtime, "multiple_isect")
mc = asm.assemble(ASM)

def v4(v3):
    """Return ``(x, y, z, 0.0)`` built from the ``x``/``y``/``z`` attributes of *v3*."""
    x, y, z = v3.x, v3.y, v3.z
    return (x, y, z, 0.0)

# Load the assembled test program under the name "test"; the returned object
# is indexed by data-section variable name below.
ds = runtime.load("test", mc)

# Fill in the program's inputs: one random ray (origin and direction padded to
# four components by v4), the number of shapes and the `adrese` value defined
# earlier in the script.
ray = ren.random_ray()
ds["ray1.origin"] = v4(ray.origin)
ds["ray1.dir"] = v4(ray.dir)
ds["num"] = len(lst_shapes)
ds["addrs"] = adrese

runtime.run("test")

# Print the "hp.t" and "clocks" values the program left in its data section.
print(ds["hp.t"], ds["clocks"])
示例#51
0
def tan_ss():
    """Assemble ``fast_tan_ss``, a scalar single-precision tangent approximation.

    The generated routine takes its argument in the low lane of ``xmm0`` and
    returns the result there.  It uses ``xmm1``-``xmm7`` and the general
    registers ``eax``, ``ecx`` and ``edx`` as scratch.

    Outline of the computation:

    * the sign bit of the input is saved (via ``eax`` into ``xmm7``) and the
      magnitude is multiplied by 4/pi and truncated to an integer octant;
    * the octant is rounded up to an even value and the magnitude is reduced
      by ``octant * pi/4`` (clamped to at most 1.0);
    * a rational function in the squared reduced argument is evaluated from
      the ``_ps_tan_p*`` / ``_ps_tan_q*`` constants, with one refinement step
      applied to the ``rcpss`` reciprocal of the denominator;
    * ``test eax, 0x2`` picks the exit: ``l_cont`` returns the value
      directly, otherwise the negated reciprocal is returned, and ``l_pole``
      substitutes ``_ps_tan_poleval`` when the squared reduced argument
      compares equal to zero.

    Two equivalent sources are built -- legacy SSE and VEX-encoded AVX -- and
    the one matching ``proc.AVX`` is assembled.

    Returns:
        The object produced by ``Tdasm.assemble`` for the selected source.
    """
    data = """
    #DATA
    uint32 _ps_am_inv_sign_mask[4] = 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF
    float _ps_am_4_o_pi[4] = 1.273239544735, 1.273239544735, 1.273239544735, 1.273239544735
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0
    float _ps_am_pi_o_4[4] = 0.78539816339, 0.78539816339, 0.78539816339, 0.78539816339
    uint32 _ps_am_sign_mask[4] = 0x80000000, 0x80000000, 0x80000000, 0x80000000
    float _ps_tan_p0[4] = -17956525.197648, -17956525.197648, -17956525.197648, -17956525.197648 
    float _ps_tan_q0[4] = -53869575.592945, -53869575.592945, -53869575.592945, -53869575.592945 
    float _ps_tan_p1[4] = 1153516.64838587, 1153516.64838587, 1153516.64838587, 1153516.64838587
    float _ps_tan_q1[4] = 25008380.18233579, 25008380.18233579, 25008380.18233579, 25008380.18233579
    float _ps_tan_p2[4] = -13093.693918138, -13093.693918138, -13093.693918138, -13093.693918138
    float _ps_tan_q2[4] = -1320892.3444021, -1320892.3444021, -1320892.3444021, -1320892.3444021
    float _ps_tan_q3[4] = 13681.296347069, 13681.296347069, 13681.296347069, 13681.296347069
    float _ps_tan_poleval[4] = 36893500000000000000.0, 36893500000000000000.0, 36893500000000000000.0, 36893500000000000000.0

    """
    # Legacy SSE encoding of the routine.
    asm_code = data + """

    #CODE
    global fast_tan_ss:
    movss	xmm1, dword [_ps_am_inv_sign_mask]
    movd	eax, xmm0
    andps	xmm0, xmm1
    movaps	xmm1, xmm0
    mulss	xmm0, dword [_ps_am_4_o_pi]

    cvttss2si	edx, xmm0
    and		eax, 0x80000000

    mov		ecx, 0x1
    movd	xmm7, eax
    mov		eax, 0x7

    movss	xmm5, dword [_ps_am_1]

    and		ecx, edx
    and		eax, edx
    add		edx, ecx
    add		eax, ecx

    cvtsi2ss	xmm0, edx
    xorps	xmm6, xmm6

    mulss	xmm0, dword [_ps_am_pi_o_4]
    subss	xmm1, xmm0
    movss	xmm2, dword [_ps_tan_p2]
    minss	xmm1, xmm5
    movss	xmm3, dword [_ps_tan_q3]
    movaps	xmm0, xmm1
    mulss	xmm1, xmm1

    mulss	xmm2, xmm1
    addss	xmm3, xmm1
    addss	xmm2, dword [_ps_tan_p1]
    mulss	xmm3, xmm1
    mulss	xmm2, xmm1
    addss	xmm3, dword [_ps_tan_q2]
    addss	xmm2, dword [_ps_tan_p0]
    mulss	xmm3, xmm1
    mulss	xmm2, xmm1
    addss	xmm3, dword [_ps_tan_q1]
    xorps	xmm0, xmm7
    mulss	xmm3, xmm1
    mulss	xmm2, xmm0
    addss	xmm3, dword [_ps_tan_q0]

    rcpss	xmm4, xmm3
    mulss	xmm3, xmm4
    mulss	xmm3, xmm4
    addss	xmm4, xmm4
    test	eax, 0x2
    subss	xmm4, xmm3

    mulss	xmm2, xmm4
    jz		l_cont
    addss	xmm2, xmm0
    comiss	xmm6, xmm1

    rcpss	xmm4, xmm2
    movss	xmm0, dword [_ps_am_sign_mask]
    jz		l_pole
    mulss	xmm2, xmm4
    mulss	xmm2, xmm4
    addss	xmm4, xmm4
    subss	xmm4, xmm2
    xorps	xmm0, xmm4

    ret		

    l_pole:
    movss	xmm1, dword [_ps_tan_poleval]
    movaps	xmm3, xmm0
    andps	xmm0, xmm2
    orps	xmm0, xmm1

    xorps	xmm0, xmm3

    ret		

    l_cont:
    addss	xmm0, xmm2
    ret		


    """

    # Same instruction sequence in three-operand VEX (AVX) form.
    avx_code = data + """

    #CODE
    global fast_tan_ss:
    vmovss	xmm1, dword [_ps_am_inv_sign_mask]
    vmovd	eax, xmm0
    vandps	xmm0, xmm0, xmm1
    vmovaps	xmm1, xmm0
    vmulss	xmm0, xmm0, dword [_ps_am_4_o_pi]

    vcvttss2si	edx, xmm0
    and		eax, 0x80000000

    mov		ecx, 0x1
    vmovd	xmm7, eax
    mov		eax, 0x7

    vmovss	xmm5, dword [_ps_am_1]

    and		ecx, edx
    and		eax, edx
    add		edx, ecx
    add		eax, ecx

    vcvtsi2ss	xmm0, xmm0, edx
    vxorps	xmm6, xmm6, xmm6

    vmulss	xmm0, xmm0, dword [_ps_am_pi_o_4]
    vsubss	xmm1, xmm1, xmm0
    vmovss	xmm2, dword [_ps_tan_p2]
    vminss	xmm1, xmm1, xmm5
    vmovss	xmm3, dword [_ps_tan_q3]
    vmovaps	xmm0, xmm1
    vmulss	xmm1, xmm1, xmm1

    vmulss	xmm2, xmm2, xmm1
    vaddss	xmm3, xmm3, xmm1
    vaddss	xmm2, xmm2, dword [_ps_tan_p1]
    vmulss	xmm3, xmm3, xmm1
    vmulss	xmm2, xmm2, xmm1
    vaddss	xmm3, xmm3, dword [_ps_tan_q2]
    vaddss	xmm2, xmm2, dword [_ps_tan_p0]
    vmulss	xmm3, xmm3, xmm1
    vmulss	xmm2, xmm2, xmm1
    vaddss	xmm3, xmm3, dword [_ps_tan_q1]
    vxorps	xmm0, xmm0, xmm7
    vmulss	xmm3, xmm3, xmm1
    vmulss	xmm2, xmm2, xmm0
    vaddss	xmm3, xmm3, dword [_ps_tan_q0]

    vrcpss	xmm4, xmm4, xmm3
    vmulss	xmm3, xmm3, xmm4
    vmulss	xmm3, xmm3, xmm4
    vaddss	xmm4, xmm4, xmm4
    test	eax, 0x2
    vsubss	xmm4, xmm4, xmm3

    vmulss	xmm2, xmm2, xmm4
    jz		l_cont
    vaddss	xmm2, xmm2, xmm0
    vcomiss	xmm6, xmm1

    vrcpss	xmm4, xmm4, xmm2
    vmovss	xmm0, dword [_ps_am_sign_mask]
    jz		l_pole
    vmulss	xmm2, xmm2, xmm4
    vmulss	xmm2, xmm2, xmm4
    vaddss	xmm4, xmm4, xmm4
    vsubss	xmm4, xmm4, xmm2
    vxorps	xmm0, xmm0, xmm4

    ret		

    l_pole:
    vmovss	xmm1, dword [_ps_tan_poleval]
    vmovaps	xmm3, xmm0
    vandps	xmm0, xmm0, xmm2
    vorps	xmm0, xmm0, xmm1

    vxorps	xmm0, xmm0, xmm3

    ret		

    l_cont:
    vaddss	xmm0, xmm0, xmm2
    ret		


    """

    # NOTE(review): the second assemble() argument presumably marks the source
    # as a callable routine rather than a standalone program -- confirm
    # against Tdasm.assemble.
    asm = Tdasm()
    if proc.AVX:
        mc = asm.assemble(avx_code, True)
    else:
        mc = asm.assemble(asm_code, True)

    return mc
示例#52
0
文件: sinss.py 项目: mario007/renmas
def sin_ss():
    """Assemble ``fast_sin_ss``, a scalar single-precision sine approximation.

    The generated routine takes its argument in the low lane of ``xmm0`` and
    returns the result there; all of ``xmm0``-``xmm7`` are used.

    Outline of the computation:

    * sign and magnitude of the input are separated;
    * the magnitude is multiplied by 2/pi and truncated to an integer
      quadrant index, and the fractional part ``f`` is taken (clamped to 1.0);
    * when bit 0 of the quadrant is set, ``1 - f`` is used instead of ``f``;
    * bit 1 of the quadrant is shifted into the sign position and XOR-ed with
      the input's sign;
    * the result is ``z * (((p3*z^2 + p2)*z^2 + p1)*z^2 + p0)`` with that
      sign bit OR-ed into ``z``.

    Two equivalent sources are built -- legacy SSE and VEX-encoded AVX -- and
    the one matching ``proc.AVX`` is assembled.

    Returns:
        The object produced by ``Tdasm.assemble`` for the selected source.
    """
    data = """
    #DATA

    uint32 _ps_am_inv_sign_mask[4] = 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF
    uint32 _ps_am_sign_mask[4] = 0x80000000, 0x80000000, 0x80000000, 0x80000000
    float _ps_am_2_o_pi[4] = 0.63661977236, 0.63661977236, 0.63661977236, 0.63661977236
    uint32 _epi32_1[4] = 1, 1, 1, 1
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0
    uint32 _epi32_2[4] = 2, 2, 2, 2

    float _ps_sincos_p3[4] = -0.00468175413, -0.00468175413, -0.00468175413, -0.00468175413
    float _ps_sincos_p2[4] = 0.0796926262, 0.0796926262, 0.0796926262, 0.0796926262
    float _ps_sincos_p1[4] = -0.64596409750621,-0.64596409750621,-0.64596409750621,-0.64596409750621 
    float _ps_sincos_p0[4] = 1.570796326794896, 1.570796326794896, 1.570796326794896, 1.570796326794896

    """

    # Legacy SSE encoding of the routine.
    asm_code = data + """

    #CODE
    global fast_sin_ss:
	movaps	xmm7, xmm0
	movss	xmm1, dword [_ps_am_inv_sign_mask]
	movss	xmm2, dword [_ps_am_sign_mask]
	movss	xmm3, dword [_ps_am_2_o_pi]
	andps	xmm0, xmm1
	andps	xmm7, xmm2
	mulss	xmm0, xmm3

	pxor	xmm3, xmm3
	movd	xmm5, dword [_epi32_1]
	movss	xmm4, dword [_ps_am_1]
	cvttps2dq	xmm2, xmm0
	pand	xmm5, xmm2
	movd	xmm1, dword [_epi32_2]
	pcmpeqd	xmm5, xmm3
	cvtdq2ps	xmm6, xmm2
	pand	xmm2, xmm1
	pslld	xmm2, 30

	subss	xmm0, xmm6
	movss	xmm3, dword [_ps_sincos_p3]
	minss	xmm0, xmm4
	subss	xmm4, xmm0
	andps	xmm0, xmm5
	andnps	xmm5, xmm4
	orps	xmm0, xmm5

	movaps	xmm1, xmm0
	movss	xmm4, dword [_ps_sincos_p2]
	mulss	xmm0, xmm0
	xorps	xmm2, xmm7
	movss	xmm5, dword [_ps_sincos_p1]
	orps	xmm1, xmm2
	movaps	xmm7, xmm0
	mulss	xmm0, xmm3
	movss	xmm6, dword [_ps_sincos_p0]
	addss	xmm0, xmm4
	mulss	xmm0, xmm7
	addss	xmm0, xmm5
	mulss	xmm0, xmm7
	addss	xmm0, xmm6
	mulss	xmm0, xmm1
    ret
    """

    # Same instruction sequence in three-operand VEX (AVX) form.
    avx_code = data + """

    #CODE
    global fast_sin_ss:
    vmovaps	xmm7, xmm0 
	vmovss	xmm1, dword [_ps_am_inv_sign_mask]
	vmovss	xmm2, dword [_ps_am_sign_mask]
	vmovss	xmm3, dword [_ps_am_2_o_pi]

	vandps	xmm0, xmm0, xmm1
	vandps	xmm7, xmm7, xmm2 
	vmulss	xmm0, xmm0, xmm3

	vpxor	xmm3, xmm3, xmm3 
	vmovd	xmm5, dword [_epi32_1]
	vmovss	xmm4, dword [_ps_am_1]
	vcvttps2dq	xmm2, xmm0
	vpand	xmm5, xmm5, xmm2
	vmovd	xmm1, dword [_epi32_2]
	vpcmpeqd	xmm5, xmm5, xmm3
	vcvtdq2ps	xmm6, xmm2
	vpand	xmm2, xmm2, xmm1
	vpslld	xmm2, xmm2, 30

	vsubss	xmm0, xmm0, xmm6
	vmovss	xmm3, dword [_ps_sincos_p3]
	vminss	xmm0, xmm0, xmm4
	vsubss	xmm4, xmm4, xmm0
	vandps	xmm0, xmm0, xmm5
	vandnps	xmm5, xmm5, xmm4
	vorps	xmm0, xmm0, xmm5

	vmovaps	xmm1, xmm0
	vmovss	xmm4, dword [_ps_sincos_p2]
	vmulss	xmm0, xmm0, xmm0
	vxorps	xmm2, xmm2, xmm7
	vmovss	xmm5, dword [_ps_sincos_p1]
	vorps	xmm1, xmm1, xmm2
	vmovaps	xmm7, xmm0
	vmulss	xmm0, xmm0, xmm3
	vmovss	xmm6, dword [_ps_sincos_p0]
	vaddss	xmm0, xmm0, xmm4
	vmulss	xmm0, xmm0, xmm7
	vaddss	xmm0, xmm0, xmm5
	vmulss	xmm0, xmm0, xmm7
	vaddss	xmm0, xmm0, xmm6
	vmulss	xmm0, xmm0, xmm1
    ret
    """

    # NOTE(review): the second assemble() argument presumably marks the source
    # as a callable routine rather than a standalone program -- confirm
    # against Tdasm.assemble.
    asm = Tdasm()
    if proc.AVX:
        mc = asm.assemble(avx_code, True)
    else:
        mc = asm.assemble(asm_code, True)

    return mc
示例#53
0
    #CODE
    call _memcpy 

    #END
    
    _memcpy:
    mov ecx, dword [n]
    mov esi, dword [sa] 
    mov edi, dword [da]
    rep movs byte [edi], byte [esi]

    ret
"""

# Import-time setup: assemble the MEMCPY program once and load it into a
# runtime under the name "memcpy".  `data_section` is the object memcpy()
# writes its arguments into before each run.
asm = Tdasm()
m = asm.assemble(MEMCPY)
run = Runtime()
data_section = run.load("memcpy", m)


def memcpy(da, sa, n):
    """Run the preloaded "memcpy" routine with the given arguments.

    The three values are written to the ``da``, ``sa`` and ``n`` entries of
    the module-level data section (in that order) and the routine is then
    executed.  Judging by the assembly that reads them, ``da`` and ``sa`` are
    the destination and source addresses and ``n`` is the byte count.
    """
    arguments = (("da", da), ("sa", sa), ("n", n))
    for variable, value in arguments:
        data_section[variable] = value
    run.run("memcpy")


class DynamicArray:
    def __init__(self, struct, reserve=0):
        self.size = 0
        self.struct = struct
示例#54
0
        if gamma < 0.0: return False

        if beta + gamma > 1.0: return False

        e3 = a * p - b * r + d * s
        t = e3 * inv_denom

        if t < 0.00001: return False # self-intersection

        return (beta, gamma, t)

        
# Generate the assembly source for the intersection routine under the label
# "ray_triangle_intersection", assemble it and load it into a fresh runtime
# under the name 'ray_triangle'.
code = ray_triangle_intersection("ray_triangle_intersection")

# NOTE(review): the second assemble() argument presumably marks the source as
# a callable routine rather than a standalone program -- confirm against
# Tdasm.assemble.
asm = Tdasm()
mc = asm.assemble(code, True)

runtime = Runtime()
runtime.load('ray_triangle', mc)

# xmm3 - origin
# xmm4 - direction
# xmm5 - p0
# xmm6 - p1
# xmm7 - p2
# edx - min_distance

test_code = """
#DATA
float p0[4]
float p1[4]
示例#55
0
    if gamma < 0.0: return False

    if beta + gamma > 1.0: return False

    e3 = a * p - b * r + d * s
    t = e3 * inv_denom

    if t < 0.00001: return False  # self-intersection

    return (beta, gamma, t)


# Generate the assembly source for the intersection routine under the label
# "ray_triangle_intersection", assemble it and load it into a fresh runtime
# under the name 'ray_triangle'.
code = ray_triangle_intersection("ray_triangle_intersection")

# NOTE(review): the second assemble() argument presumably marks the source as
# a callable routine rather than a standalone program -- confirm against
# Tdasm.assemble.
asm = Tdasm()
mc = asm.assemble(code, True)

runtime = Runtime()
runtime.load('ray_triangle', mc)

# xmm3 - origin
# xmm4 - direction
# xmm5 - p0
# xmm6 - p1
# xmm7 - p2
# edx - min_distance

test_code = """
#DATA
float p0[4]
float p1[4]
示例#56
0
            code += lst_inst2[l] + "\n"
        for l in range(len(lst_inst2), len(lst_inst1)):
            code += lst_inst1[l] + "\n"

    return code

def arth128_32(tokens):
    """Delegate to ``arth_mix`` with the size pair (128, 32)."""
    sizes = (128, 32)
    return arth_mix(tokens, *sizes)

def arth32_128(tokens):
    """Delegate to ``arth_mix`` with the size pair (32, 128)."""
    sizes = (32, 128)
    return arth_mix(tokens, *sizes)

def arth128_128(tokens):
    """Delegate to ``arth_mix`` with the size pair (128, 128)."""
    sizes = (128, 128)
    return arth_mix(tokens, *sizes)

def arth32_32(tokens):
    """Delegate to ``arth_mix`` with the size pair (32, 32)."""
    sizes = (32, 32)
    return arth_mix(tokens, *sizes)

if __name__ == "__main__":
    # Manual smoke test: register the two macro handlers with the assembler,
    # assemble ASM_CODE, run it and print the "rez" value from its data
    # section.
    asm = Tdasm()
    asm.register_macro("arth128", arth128)
    asm.register_macro("arth32", arth32)
    mc = asm.assemble(ASM_CODE)

    run = Runtime()
    ds = run.load("test", mc)
    run.run("test")

    print(ds["rez"])

示例#57
0
def pow_ss():
    """Assemble ``fast_pow_ss``, a scalar single-precision power approximation.

    The generated routine takes the base in the low lane of ``xmm0`` and the
    exponent in the low lane of ``xmm1`` and returns the result in ``xmm0``.
    It uses ``xmm2``-``xmm7`` as scratch and spills one dword just below the
    stack pointer.

    Outline of the computation:

    * a ``0 < base`` comparison mask is computed first and saved at
      ``[esp - 4]``; it is AND-ed into the final scale factor, so the result
      is zero when that comparison is false;
    * a base-2 logarithm of the base is formed from its exponent bits plus a
      rational polynomial in the mantissa (``_ps_log_*`` constants, scaled by
      ``_ps_log2_c0``);
    * the logarithm is multiplied by the exponent and clamped to
      [``_ps_exp2_lo``, ``_ps_exp2_hi``];
    * 2**value is rebuilt from its integer part (shifted into the float
      exponent field) and a rational polynomial in the fractional part
      (``_ps_exp2_*`` constants).

    Two equivalent sources are built -- legacy SSE and VEX-encoded AVX -- and
    the one matching ``proc.AVX`` is assembled.

    Returns:
        The object produced by ``Tdasm.assemble`` for the selected source.
    """
    data = """
    #DATA

    uint32 _epi32_1[4] = 1, 1, 1, 1
    float _ps_am_1[4] = 1.0, 1.0, 1.0, 1.0

    uint32 _ps_am_inv_mant_mask[4] = 0x807FFFFF,  0x807FFFFF,  0x807FFFFF,  0x807FFFFF 
    uint32 _ps_am_min_norm_pos[4] = 0x00800000, 0x00800000, 0x00800000, 0x00800000
    uint32 _epi32_0x7f[4] = 0x7F, 0x7F, 0x7F, 0x7F

    float _ps_log_p0[4] = -0.7895802788, -0.7895802788, -0.7895802788, -0.7895802788
    float _ps_log_q0[4] = -35.6722798256, -35.6722798256, -35.6722798256, -35.6722798256
    float _ps_log_p1[4] = 16.3866645699,  16.3866645699,  16.3866645699,  16.3866645699
    float _ps_log_q1[4] = 312.093766372,  312.093766372,  312.093766372,  312.093766372
    float _ps_log_p2[4] = -64.14099529,  -64.14099529,  -64.14099529,  -64.14099529
    float _ps_log_q2[4] = -769.691943550,  -769.691943550,  -769.691943550,  -769.691943550

    float _ps_log2_c0[4] = 1.442695040,  1.442695040,  1.442695040,  1.442695040
    float _ps_exp2_hi[4] = 127.4999961,  127.4999961,  127.4999961,  127.4999961
    float _ps_exp2_lo[4] = -127.4999961,  -127.4999961,  -127.4999961,  -127.4999961
    float _ps_am_0p5[4] = 0.5, 0.5, 0.5, 0.5
    float _ps_exp2_p0[4] = 0.0230933477, 0.0230933477, 0.0230933477, 0.0230933477
    float _ps_exp2_q0[4] = 233.18421172, 233.18421172, 233.18421172, 233.18421172 
    float _ps_exp2_p1[4] = 20.202065669,  20.202065669,  20.202065669,  20.202065669
    float _ps_exp2_q1[4] = 4368.211668, 4368.211668, 4368.211668, 4368.211668
    float _ps_exp2_p2[4] = 1513.90680, 1513.90680, 1513.90680, 1513.90680
    """
    # Legacy SSE encoding of the routine.
    # NOTE(review): both variants address the spill slot through the 32-bit
    # register esp ([esp - 4]) regardless of platform word size -- confirm
    # this is valid when the code is loaded into a 64-bit process.
    asm_code = data + """

    #CODE
    global fast_pow_ss:
	xorps	xmm5, xmm5
	movss	xmm2, dword [_ps_am_inv_mant_mask]
	cmpss   xmm5, xmm0, 1
	maxss	xmm0, dword [_ps_am_min_norm_pos]  ;// cut off denormalized stuff
	movss	xmm7, dword [_ps_am_1]
	movaps	xmm3, xmm0

	andps	xmm0, xmm2
	orps	xmm0, xmm7

	movss	dword [esp - 4], xmm5

	movaps	xmm4, xmm0
	movd	xmm2, dword [_epi32_0x7f]
	subss	xmm0, xmm7
	addss	xmm4, xmm7
	psrld	xmm3, 23
	rcpss	xmm4, xmm4
	mulss	xmm0, xmm4
	psubd	xmm3, xmm2
	addss	xmm0, xmm0

	movaps	xmm2, xmm0
	mulss	xmm0, xmm0

	movss	xmm4, dword [_ps_log_p0]
	movss	xmm6, dword [_ps_log_q0]

	mulss	xmm4, xmm0
	movss	xmm5, dword [_ps_log_p1]
	mulss	xmm6, xmm0
	movss	xmm7, dword [_ps_log_q1]

	addss	xmm4, xmm5
	addss	xmm6, xmm7

	movss	xmm5, dword [_ps_log_p2]
	mulss	xmm4, xmm0
	movss	xmm7, dword [_ps_log_q2]
	mulss	xmm6, xmm0

	addss	xmm4, xmm5
	movss	xmm5, dword [_ps_log2_c0]
	addss	xmm6, xmm7
	cvtdq2ps	xmm7, xmm3

	mulss	xmm0, xmm4
	rcpss	xmm6, xmm6

	mulss	xmm0, xmm6
	movss	xmm4, dword [_ps_exp2_hi]
	mulss	xmm0, xmm2
	movss	xmm6, dword [_ps_exp2_lo]
	mulss	xmm2, xmm5
	mulss	xmm0, xmm5
	addss	xmm2, xmm7
	movss	xmm3, dword [_ps_am_0p5]
	addss	xmm0, xmm2
	xorps	xmm2, xmm2
	movd	xmm5, dword [_epi32_1]

	mulss	xmm0, xmm1

	minss	xmm0, xmm4
	movss	xmm4, dword [_ps_exp2_p0]
	maxss	xmm0, xmm6
	movss	xmm6, dword [_ps_exp2_q0]

	addss	xmm3, xmm0

	cmpss xmm2, xmm3, 5
	pand	xmm2, xmm5

	cvttps2dq	xmm3, xmm3

	psubd	xmm3, xmm2

	cvtdq2ps	xmm2, xmm3

	subss	xmm0, xmm2

	movaps	xmm2, xmm0
	mulss	xmm0, xmm0

	paddd	xmm3, oword [_epi32_0x7f]

	mulss	xmm4, xmm0
	mulss	xmm6, xmm0
	addss	xmm4, dword [_ps_exp2_p1]
	addss	xmm6, dword [_ps_exp2_q1]

	mulss	xmm4, xmm0
	addss	xmm4, dword [_ps_exp2_p2]

	mulss	xmm2, xmm4

	movss	xmm0, dword [_ps_am_1]
	subss	xmm6, xmm2
	pslld	xmm3, 23
	rcpss	xmm6, xmm6
	movss	xmm5, dword [esp - 4]
	mulss	xmm2, xmm6
	andps	xmm3, xmm5
	addss	xmm2, xmm2
	addss	xmm0, xmm2

	mulss	xmm0, xmm3
    ret
    """

    # Same instruction sequence in three-operand VEX (AVX) form.
    avx_code = data + """

    #CODE
    global fast_pow_ss:
	vxorps	xmm5, xmm5, xmm5
	vmovss	xmm2, dword [_ps_am_inv_mant_mask]
	vcmpss  xmm5, xmm5, xmm0, 1
	vmaxss	xmm0, xmm0, dword [_ps_am_min_norm_pos]  ;// cut off denormalized stuff
	vmovss	xmm7, dword [_ps_am_1]
	vmovaps	xmm3, xmm0

	vandps	xmm0, xmm0, xmm2
	vorps	xmm0, xmm0, xmm7

	vmovss	dword [esp - 4], xmm5

	vmovaps	xmm4, xmm0
	vmovd	xmm2, dword [_epi32_0x7f]
	vsubss	xmm0, xmm0, xmm7
	vaddss	xmm4, xmm4, xmm7
	vpsrld	xmm3, xmm3, 23
	vrcpss	xmm4, xmm4, xmm4
	vmulss	xmm0, xmm0, xmm4
	vpsubd	xmm3, xmm3, xmm2
	vaddss	xmm0, xmm0, xmm0

	vmovaps	xmm2, xmm0
	vmulss	xmm0, xmm0, xmm0

	vmovss	xmm4, dword [_ps_log_p0]
	vmovss	xmm6, dword [_ps_log_q0]

	vmulss	xmm4, xmm4, xmm0
	vmovss	xmm5, dword [_ps_log_p1]
	vmulss	xmm6, xmm6, xmm0
	vmovss	xmm7, dword [_ps_log_q1]

	vaddss	xmm4, xmm4, xmm5
	vaddss	xmm6, xmm6, xmm7

	vmovss	xmm5, dword [_ps_log_p2]
	vmulss	xmm4, xmm4, xmm0
	vmovss	xmm7, dword [_ps_log_q2]
	vmulss	xmm6, xmm6, xmm0

	vaddss	xmm4, xmm4, xmm5
	vmovss	xmm5, dword [_ps_log2_c0]
	vaddss	xmm6, xmm6, xmm7
	vcvtdq2ps	xmm7, xmm3

	vmulss	xmm0, xmm0, xmm4
	vrcpss	xmm6, xmm6, xmm6

	vmulss	xmm0, xmm0, xmm6
	vmovss	xmm4, dword [_ps_exp2_hi]
	vmulss	xmm0, xmm0, xmm2
	vmovss	xmm6, dword [_ps_exp2_lo]
	vmulss	xmm2, xmm2, xmm5
	vmulss	xmm0, xmm0, xmm5
	vaddss	xmm2, xmm2, xmm7
	vmovss	xmm3, dword [_ps_am_0p5]
	vaddss	xmm0, xmm0, xmm2
	vxorps	xmm2, xmm2, xmm2
	vmovd	xmm5, dword [_epi32_1]

	vmulss	xmm0, xmm0, xmm1

	vminss	xmm0, xmm0, xmm4
	vmovss	xmm4, dword [_ps_exp2_p0]
	vmaxss	xmm0, xmm0, xmm6
	vmovss	xmm6, dword [_ps_exp2_q0]

	vaddss	xmm3, xmm3, xmm0

	vcmpss xmm2, xmm2, xmm3, 5
	vpand	xmm2, xmm2, xmm5

	vcvttps2dq	xmm3, xmm3

	vpsubd	xmm3, xmm3, xmm2

	vcvtdq2ps	xmm2, xmm3

	vsubss	xmm0, xmm0, xmm2

	vmovaps	xmm2, xmm0
	vmulss	xmm0, xmm0, xmm0

	vpaddd	xmm3, xmm3, oword [_epi32_0x7f]

	vmulss	xmm4, xmm4, xmm0
	vmulss	xmm6, xmm6, xmm0
	vaddss	xmm4, xmm4, dword [_ps_exp2_p1]
	vaddss	xmm6, xmm6, dword [_ps_exp2_q1]

	vmulss	xmm4, xmm4, xmm0
	vaddss	xmm4, xmm4, dword [_ps_exp2_p2]

	vmulss	xmm2, xmm2, xmm4

	vmovss	xmm0, dword [_ps_am_1]
	vsubss	xmm6, xmm6, xmm2
	vpslld	xmm3, xmm3, 23
	vrcpss	xmm6, xmm6, xmm6
	vmovss	xmm5, dword [esp - 4]
	vmulss	xmm2, xmm2, xmm6
	vandps	xmm3, xmm3, xmm5
	vaddss	xmm2, xmm2, xmm2
	vaddss	xmm0, xmm0, xmm2

	vmulss	xmm0, xmm0, xmm3
    ret
    """

    # NOTE(review): the second assemble() argument presumably marks the source
    # as a callable routine rather than a standalone program -- confirm
    # against Tdasm.assemble.
    asm = Tdasm()
    if proc.AVX:
        mc = asm.assemble(avx_code, True)
    else:
        mc = asm.assemble(asm_code, True)

    return mc