Exemplo n.º 1
0
 def test_fpgabackend_rawhls(self):
     """Check MNIST results when the middle layers run in the raw HLS sim.

     The layers of ``self.streamlined_net`` between the first and the last
     are handed to ``fpga_backend.synthesize``; the simulation layers it
     returns are spliced back between the original first and last layers,
     and the resulting pipeline is evaluated with ``testOnMNIST`` on
     ``self.numImagesToTest`` images. The (ok, nok) counts must equal
     ``self.ok_golden`` / ``self.nok_golden``.
     """
     # resource allocation function to set number of PE/SIMD per layer;
     # the allocation is statically determined for this test case.
     def res_alloc_predetermined(pipeline, net, dev):
         """Return a deep copy of ``pipeline`` with fixed SIMD/PE values.

         ``net`` and ``dev`` are accepted but not used by this allocator.
         """
         ret_pipeline = copy.deepcopy(pipeline)
         layer_simd = [16, 64, 64, 64]
         layer_pe = [64, 64, 64, 10]
         # walk the two tables in lockstep instead of a hard-coded range(4)
         for i, (simd, pe) in enumerate(zip(layer_simd, layer_pe)):
             ret_pipeline[i].simd = simd
             ret_pipeline[i].pe = pe
         return ret_pipeline

     # make a temp dir for generated HLS
     dirpath = tempfile.mkdtemp()
     # pick all layers except first (input quantization) and last
     # (final batchnorm) of the streamlined network
     hlslayers = self.streamlined_net.layers[1:-1]
     # call the FPGA backend to generate HLS and compile raw HLS sim
     dev = device.Device('XLNX:PYNQ-Z1.json', 100)
     ret = fpga_backend.synthesize(
         hlslayers, self.net, dev, res_alloc_predetermined, dirpath,
         "sfcall-")
     hlspipeline = ret.getSimLayer()
     # build a "mixed pipeline", where the first and last layers are in
     # device-neutral simulation, and everything in the middle is handled
     # by the HLS sim executable
     mixed_pipeline = (
         [self.streamlined_net.layers[0]]
         + hlspipeline
         + [self.streamlined_net.layers[-1]]
     )
     # test on MNIST
     (ok, nok) = testOnMNIST(nn.NN(layers=mixed_pipeline),
                             self.numImagesToTest)
     # NOTE: the temp dir is not removed here (cleanup is disabled below),
     # so the generated HLS stays on disk after the test.
     #shutil.rmtree(dirpath)
     # a single tuple comparison fails under exactly the same conditions as
     # the former compound boolean, but reports both counts on failure
     self.assertEqual((ok, nok), (self.ok_golden, self.nok_golden))
Exemplo n.º 2
0
    def test_cycles_per_layer(self):
        """Check the folding factors against each layer's dimensions.

        Loads the pruned DoReFa-Net prototxt (no weights), builds a
        ``PerfModel`` for the KU115 device description, runs
        ``maximise_fps()`` and then asserts, for every layer, that
        SIMD <= input size, PE <= output size and MMV <= output dimension.
        Values reported as tuples are printed and reduced to their first
        entry before the comparison.
        """
        def first_if_tuple(value):
            """Print a tuple-valued dimension and return its first entry."""
            if isinstance(value, tuple):
                print(value)
                return value[0]
            return value

        l = CaffeLoader(
            None,
            "./FINN/inputs/dorefanet-pruned-without-extra-messages.prototxt")
        net = nn.NN(l)
        dev = device.Device('XLNX:KU115.json')
        perfmodel = pm.PerfModel(net, dev)
        # called for its effect on the model; the returned value is not
        # needed by this test
        perfmodel.maximise_fps()
        for idx, layer in enumerate(net.layers):
            # query all three dimensions first, then normalise them, so the
            # getters run before any diagnostic output is produced
            raw_dims = (layer.getInputSize(),
                        layer.getOutputSize(),
                        layer.get_out_dim())
            in_chans, out_chans, out_dim = [
                first_if_tuple(dim) for dim in raw_dims]

            checks = (
                (perfmodel.SIMD[idx], in_chans),
                (perfmodel.PE[idx], out_chans),
                (perfmodel.MMV[idx], out_dim),
            )
            # single-argument print with explicit formatting emits the same
            # "factor limit" line on Python 2 and Python 3
            for folding, limit in checks:
                print("%s %s" % (folding, limit))
            for folding, limit in checks:
                self.assertLessEqual(folding, limit)
Exemplo n.º 3
0
 def test_simd_pe_mmv_constraints(self):
     """Assert the folding factors stay within each layer's dimensions.

     Builds a ``PerfModel`` for the sfc.prototxt network (no weights) on
     the KU115 device description, runs ``maximise_fps()`` and checks for
     every layer that SIMD <= getInputSize(), PE <= getOutputSize() and
     MMV <= get_out_dim().
     """
     network = nn.NN(CaffeLoader(None, "./FINN/inputs/sfc.prototxt"))
     model = pm.PerfModel(network, device.Device('XLNX:KU115.json'))
     fps = model.maximise_fps()

     # (PerfModel attribute, layer getter) pairs, checked in this order
     bounds = (
         ("SIMD", "getInputSize"),
         ("PE", "getOutputSize"),
         ("MMV", "get_out_dim"),
     )
     for position, current in enumerate(network.layers):
         for factor_name, getter_name in bounds:
             factor = getattr(model, factor_name)[position]
             limit = getattr(current, getter_name)()
             self.assertLessEqual(factor, limit)
Exemplo n.º 4
0
 def test_cycles_per_op(self):
     """Check the reported LUT usage against the per-op LUT cost.

     For the sfc network (caffemodel + prototxt) on the VU9P device
     description, ``network_utilisation()['luts']`` must equal
     2 * (number of matrix layers) * ``dev.lut_cost_per_op()``.
     """
     loader = CaffeLoader("./FINN/inputs/sfc.caffemodel",
                          "./FINN/inputs/sfc.prototxt")
     network = nn.NN(loader)
     target = device.Device('XLNX:VU9P.json')
     model = pm.PerfModel(network, target)

     utilisation = model.network_utilisation()
     matrix_layer_count = network.count_matrix_layers()

     actual_luts = utilisation['luts']
     expected_luts = 2 * matrix_layer_count * target.lut_cost_per_op()
     self.assertEqual(actual_luts, expected_luts)
Exemplo n.º 5
0
def demo_lfc():
    """Build a four-layer fully connected network and print its estimates.

    The network uses all-zero weight matrices, is modelled on the VU9P
    device description at 192.4 MHz, and is optimised with
    ``maximise_fps()``. The folding factors, hardware cost, topology and
    cycle counts are printed, followed by a one-line summary of fps and
    LUT/BRAM utilisation. Log records go to ``FINN.log``.
    """
    # INFO enables logging output; use WARNING for a quieter log
    logging.basicConfig(filename='FINN.log', level=logging.INFO)

    # weight shapes as (OutChans, InChans), one per layer
    weight_shapes = [(1024, 832), (1024, 1024), (1024, 1024), (64, 1024)]
    # trailing positional arguments are wbits, ibits, obits
    lfcnetwork = [
        layers.FullyConnectedLayer(np.zeros(shape), 1, 1, 1)
        for shape in weight_shapes
    ]

    net = FINN.core.nn.NN(layers=lfcnetwork)
    dev = device.Device('XLNX:VU9P.json', frequency=192.4)
    perf = perf_model.PerfModel(net, dev)

    perf.maximise_fps()

    # Manual overrides (disabled):
    #   SIMD = [64, 64, 64, 64], PE = [256, 256, 256, 16]

    perf.fps()

    # run the nswg calculations in their fixed sequence
    for step in ("calculate_neural_folding",
                 "calculate_write_block_cycles",
                 "calculate_read_block_cycles",
                 "calculate_total_cycles",
                 "calculate_input_multipliers"):
        getattr(perf.nswg, step)()

    # then dump every report
    for report in ("print_folding_factors",
                   "print_hardware_cost",
                   "print_topology",
                   "print_cycles"):
        getattr(perf, report)()

    fps = perf.fps()
    # NOTE(review): on Python 2, `/` truncates when both operands are ints;
    # confirm the utilisation and device totals are floats.
    lut_percent = perf.network_utilisation()['luts'] / dev.luts * 100
    bram_percent = perf.network_utilisation()['brams'] / dev.brams * 100

    summary = "Achieved fps of %f with %f%% LUT utilisation and %f%% BRAM utilisation at %f Mhz"
    print(summary % (fps, lut_percent, bram_percent, dev.frequency))
Exemplo n.º 6
0
 def setUp(self):
     """Load the lfc-w1a1 network and prepare the fixture attributes.

     Sets ``self.net`` (loaded from the caffemodel/prototxt under
     ``FINN_ROOT/inputs``), ``self.dev`` (VU9P device description at
     frequency 300), ``self.streamlined_net`` (a deep copy of the network
     whose layers have been passed through ``transform.makeCromulent``),
     and the image count / golden prediction counts used for verification.
     The layer list is printed before and after the transformation.
     """
     # use the first numImagesToTest of the test set for verification,
     # with the expected successful / unsuccessful prediction counts
     self.numImagesToTest = 1000
     self.ok_golden = 967
     self.nok_golden = 33

     nname = "lfc-w1a1"
     proto = FINN_ROOT + "/inputs/{0}.prototxt".format(nname)
     weights = FINN_ROOT + "/inputs/{0}.caffemodel".format(nname)
     self.net = nn.NN(CaffeLoader(weights, proto))

     self.dev = device.Device('XLNX:VU9P.json', 300)

     streamlined = copy.deepcopy(self.net)
     self.streamlined_net = streamlined
     print(streamlined.layers)
     streamlined.layers = transform.makeCromulent(streamlined.layers)
     print(streamlined.layers)
Exemplo n.º 7
0
def demo_hwgq_import():
    """Print folding factors and hardware cost for three SIMD settings.

    Loads ``inputs/sfc.prototxt`` (no weights), builds a ``PerfModel`` for
    the KU115 device description and prints the folding factors and
    hardware cost three times: as constructed, with every SIMD entry set
    to 5, and with every SIMD entry set to 20.
    """
    network = FINN.core.nn.NN(CaffeLoader(None, "inputs/sfc.prototxt"))
    perf = perf_model.PerfModel(network, device.Device('XLNX:KU115.json'))

    def report():
        """Print the current folding factors followed by the hardware cost."""
        perf.print_folding_factors()
        perf.print_hardware_cost()

    report()

    # PE overrides (10, then 100) are disabled; only SIMD is changed
    for simd_value in (5, 20):
        for position, _ in enumerate(perf.SIMD):
            perf.SIMD[position] = simd_value
        report()
Exemplo n.º 8
0
def demo_sfc():
    """Model a 6-conv / 3-FC network with fixed folding factors.

    The network uses all-zero weights on the VU9P device description at
    248.5 MHz. ``maximise_fps()`` is run first, then SIMD, PE and MMV are
    overwritten with fixed per-layer values. The reports, the ``nswg``
    object and a summary of fps and LUT/BRAM utilisation are printed.
    Log records go to ``FINN.log``.
    """
    # INFO enables logging output; use WARNING for a quieter log
    logging.basicConfig(filename='FINN.log', level=logging.INFO)

    # (weight shape as (out, in, kernel, kernel), in_dim) per conv layer
    conv_specs = [
        ((64, 3, 3, 3), 32),
        ((64, 64, 3, 3), 30),
        ((128, 64, 3, 3), 14),
        ((128, 128, 3, 3), 12),
        ((256, 128, 3, 3), 5),
        ((256, 256, 3, 3), 3),
    ]
    # weight shapes of the fully connected layers
    fc_shapes = [(512, 256), (512, 512), (10, 512)]

    # The original comment labels the arguments after the weights as
    # in_dim, pad, stride, wbits, ibits, obits.
    # NOTE(review): seven values are passed; confirm the meaning of the
    # last one against the ConvolutionLayer signature.
    sfcnetwork = [
        layers.ConvolutionLayer(np.zeros(shape), in_dim, 0, 1, 1, 1, 1, 0)
        for shape, in_dim in conv_specs
    ]
    sfcnetwork.extend(
        layers.FullyConnectedLayer(np.zeros(shape), 1, 1, 1)
        for shape in fc_shapes
    )

    net = FINN.core.nn.NN(layers=sfcnetwork)

    dev = device.Device('XLNX:VU9P.json', frequency=248.5)  # Measured on AWS
    perf = perf_model.PerfModel(net, dev)

    perf.maximise_fps()

    # From BNN spreadsheet, t3
    simd_factors = [3, 64, 64, 64, 64, 64, 16, 16, 16]
    pe_factors = [64, 64, 64, 64, 64, 64, 16, 16, 4]
    mmv_factors = [1, 1, 1, 1, 1, 1, 1, 1, 1]
    for position, value in enumerate(simd_factors):
        perf.SIMD[position] = value
    for position, value in enumerate(pe_factors):
        perf.PE[position] = value
    for position, value in enumerate(mmv_factors):
        perf.MMV[position] = value

    # FPS given the above folding factors
    perf.fps()

    perf.nswg.calculate_neural_folding()
    perf.nswg.calculate_write_block_cycles()
    perf.nswg.calculate_read_block_cycles()
    perf.nswg.calculate_total_cycles()
    perf.nswg.calculate_input_multipliers()
    perf.print_folding_factors()
    perf.print_hardware_cost()
    perf.print_topology()
    perf.print_cycles()
    fps = perf.fps()

    print(perf.nswg)

    # NOTE(review): on Python 2, `/` truncates when both operands are ints;
    # confirm the utilisation and device totals are floats.
    lut_percent = perf.network_utilisation()['luts'] / dev.luts * 100
    bram_percent = perf.network_utilisation()['brams'] / dev.brams * 100
    summary = "Achieved fps of %f with %f%% LUT utilisation and %f%% BRAM utilisation at %f Mhz"
    print(summary % (fps, lut_percent, bram_percent, dev.frequency))
Exemplo n.º 9
0
def demo_dorefa():
    """Model a 5-conv / 3-FC network with fixed folding factors.

    The network uses all-zero weights on the VU9P device description at
    101 MHz. Three of the convolution layers get ``parallel = 2``. SIMD,
    PE and MMV are set to fixed per-layer values. The reports, the
    ``nswg`` object and a summary of fps and LUT/BRAM utilisation are
    printed. Log records go to ``FINN.log``.
    """
    # INFO enables logging output; use WARNING for a quieter log
    logging.basicConfig(filename='FINN.log', level=logging.INFO)

    # Per conv layer: (weight shape as (out, in, kernel, kernel), in_dim,
    # stride, parallel). parallel=None leaves the attribute untouched.
    # The original comment labels the arguments after the weights as
    # in_dim, pad, stride, wbits, ibits, obits.
    # NOTE(review): seven values are passed; confirm the meaning of the
    # last one against the ConvolutionLayer signature.
    conv_specs = [
        ((68, 3, 12, 12), 227, 4, None),
        ((90, 34, 5, 5), 58, 1, 2),
        ((272, 180, 3, 3), 29, 1, None),
        ((192, 136, 3, 3), 16, 1, 2),
        ((128, 192, 3, 3), 16, 1, 2),
    ]
    # weight shapes of the fully connected layers
    fc_shapes = [(4096, 9216), (4096, 4096), (1000, 4096)]

    dorefanetwork = []
    for shape, in_dim, stride, parallel in conv_specs:
        conv = layers.ConvolutionLayer(
            np.zeros(shape), in_dim, 0, stride, 1, 1, 1, 0)
        if parallel is not None:
            conv.parallel = parallel
        dorefanetwork.append(conv)
    for shape in fc_shapes:
        dorefanetwork.append(
            layers.FullyConnectedLayer(np.zeros(shape), 1, 1, 1))

    net = FINN.core.nn.NN(layers=dorefanetwork)

    dev = device.Device('XLNX:VU9P.json', frequency=101)  # Measured on AWS
    perf = perf_model.PerfModel(net, dev)

    # From BNN spreadsheet, t3
    simd_factors = [3, 34, 45, 34, 64, 64, 64, 8]
    pe_factors = [68, 90, 136, 64, 32, 32, 16, 32]
    mmv_factors = [18, 3, 3, 1, 1, 1, 1, 1]
    for position, value in enumerate(simd_factors):
        perf.SIMD[position] = value
    for position, value in enumerate(pe_factors):
        perf.PE[position] = value
    for position, value in enumerate(mmv_factors):
        perf.MMV[position] = value

    # FPS given the above folding factors
    perf.fps()

    perf.nswg.calculate_neural_folding()
    perf.nswg.calculate_write_block_cycles()
    perf.nswg.calculate_read_block_cycles()
    perf.nswg.calculate_total_cycles()
    perf.nswg.calculate_input_multipliers()
    perf.print_folding_factors()
    perf.print_hardware_cost()
    perf.print_topology()
    perf.print_cycles()
    fps = perf.fps()

    print(perf.nswg)

    # NOTE(review): on Python 2, `/` truncates when both operands are ints;
    # confirm the utilisation and device totals are floats.
    lut_percent = perf.network_utilisation()['luts'] / dev.luts * 100
    bram_percent = perf.network_utilisation()['brams'] / dev.brams * 100
    summary = "Achieved fps of %f with %f%% LUT utilisation and %f%% BRAM utilisation at %f Mhz"
    print(summary % (fps, lut_percent, bram_percent, dev.frequency))
Exemplo n.º 10
0
#       derived from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
# DISCLAIMED. IN NO EVENT SHALL <COPYRIGHT HOLDER> BE LIABLE FOR ANY
# DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import FINN.core.perf_model as pm
import FINN.core.device as device
import FINN.core.nn as nn
from FINN.frontend.caffeloader import CaffeLoader

# Script: build a performance model for the lfc-w1a1 network on the VU9P
# device description, run maximise_fps(), and print the resulting LUT and
# BRAM utilisation as percentages of the device totals.
print("Hardware model")

frequency = 200
# Alternative target (disabled): device.Device('XLNX:PYNQ-Z1.json', frequency)
dev = device.Device('XLNX:VU9P.json', frequency)
net = nn.NN(CaffeLoader(None, './FINN/inputs/lfc-w1a1.prototxt'))
perfmodel = pm.PerfModel(net, dev)
fps = perfmodel.maximise_fps()
print("Network Utilization")

# NOTE(review): on Python 2, `/` truncates when both operands are ints;
# confirm the utilisation and device totals are floats.
print(perfmodel.network_utilisation()['luts'] / dev.luts * 100)
print(perfmodel.network_utilisation()['brams'] / dev.brams * 100)