def __init__(self, input_file, output_file, MMB, index_type):

        self.input_file = input_file
        self.output_file = output_file
        self.MMB = MMB

        if index_type == "btree":
            self.index = BTree()
        elif index_type == "hash":
            self.index = MyHash()
        else:
            print("Invalid index type")
            exit(-1)

        self.dup_open()

        while True:
            rec = self.get_next()

            # input file finished
            if rec == None:
                self.flush_output()
                break

            # append the record to output buffer if unique
            if not self.index.search(rec):
                self.out_buff.append(rec)
                self.index.insert(rec)

            # if output buffer is full then flush the output
            if len(self.out_buff) >= self.NTB:
                self.flush_output()

        self.dup_close()
Exemplo n.º 2
0
    def test_keys_and_values(self):
        """Eight entries stored under distinct keys give 8 keys and 8 values."""
        hash_table = MyHash()

        # Draw the keys without replacement. Independent random draws from 26
        # letters repeat a key most of the time, which would make the counts
        # below non-deterministic (presumably a repeated key overwrites the
        # earlier entry - confirm against MyHash.set).
        keys = random.sample(string.ascii_lowercase, 8)
        for key in keys:
            # values may repeat freely; only the keys need to be unique
            hash_table.set(key, random.choice(string.ascii_lowercase))

        self.assertEqual(len(hash_table.values()), 8)
        self.assertEqual(len(hash_table.keys()), 8)
Exemplo n.º 3
0
    def test_delete(self):
        """Deleting a stored key must shrink the list of stored values."""
        table = MyHash()
        table.set('hello', 'world')

        # compare the number of values before and after the deletion
        count_before = len(table.values())
        table.delete('hello')
        count_after = len(table.values())

        self.assertLess(count_after, count_before)
Exemplo n.º 4
0
    def test_resize(self):
        """The bucket array grows after 30 inserts and shrinks back after 26 deletes."""
        hash_table = MyHash()

        def random_word():
            # three random lowercase letters
            return ''.join(
                random.choice(string.ascii_lowercase) for _ in range(3))

        # test increasing in size when load factor is met
        original_length = len(hash_table._buckets)

        # Collect 30 *distinct* keys first so that exactly 30 entries are
        # stored; colliding random keys would otherwise lower the count.
        keys = set()
        while len(keys) < 30:
            keys.add(random_word())
        for key in keys:
            hash_table.set(key, random_word())

        new_length = len(hash_table._buckets)
        self.assertGreater(new_length, original_length)

        # test decreasing in size when load is too small
        # list(...) makes the slice work whether keys() returns a list or a
        # view, and snapshots the keys before entries are removed.
        for key in list(hash_table.keys())[:26]:
            hash_table.delete(key)

        new_length = len(hash_table._buckets)
        self.assertEqual(new_length, original_length)
class DuplicateElimination(object):
    """Copy the records of a text file to another file, dropping duplicates.

    Each line of the input file is one record. Records are read block-wise
    into ``MMB - 1`` input buffers, looked up in an index (``BTree`` or
    ``MyHash``), and written through a single output buffer the first time
    they are seen. The order of records in the output is not the input
    order: ``get_next`` rotates over the input buffers and takes records from
    the end of each buffer.

    The whole job runs inside the constructor.
    """

    def __init__(self, input_file, output_file, MMB, index_type):
        """Run duplicate elimination from ``input_file`` into ``output_file``.

        Args:
            input_file: path of the text file to read, one record per line.
            output_file: path of the file the unique records are written to.
            MMB: number of main memory blocks; one is the output buffer and
                the remaining ``MMB - 1`` are input buffers.
            index_type: ``"btree"`` selects ``BTree()``, ``"hash"`` selects
                ``MyHash()``.

        Raises:
            SystemExit: with code -1, after printing "Invalid index type",
                when ``index_type`` is neither ``"btree"`` nor ``"hash"``.
            ValueError: from ``dup_open`` when ``MMB`` is smaller than 2.
        """
        self.input_file = input_file
        self.output_file = output_file
        self.MMB = MMB

        if index_type == "btree":
            self.index = BTree()
        elif index_type == "hash":
            self.index = MyHash()
        else:
            print("Invalid index type")
            # Raise SystemExit directly instead of calling the ``exit``
            # builtin: ``exit`` is injected by the ``site`` module for
            # interactive use and is missing when site is disabled.
            raise SystemExit(-1)

        self.dup_open()
        try:
            while True:
                rec = self.get_next()

                # input file finished
                if rec is None:
                    break

                # keep the record only the first time it is seen
                if not self.index.search(rec):
                    self.out_buff.append(rec)
                    self.index.insert(rec)

                    # the buffer only grows here, so this is the only place
                    # it can become full
                    if len(self.out_buff) >= self.NTB:
                        self.flush_output()

            # write whatever is left in a partially filled buffer
            self.flush_output()
        finally:
            # close both files even if reading or indexing failed
            self.dup_close()

    def flush_output(self):
        """Write the buffered records to the output file and empty the buffer.

        Does nothing when the buffer is empty. Records are written as they
        are stored, without adding any separator.
        """
        if not self.out_buff:
            return

        self.out_fd.write("".join(self.out_buff))
        self.out_fd.flush()
        del self.out_buff[:]

    def dup_open(self):
        """Measure the input, size the buffers and open both files.

        Sets ``NR`` (non-blank lines), ``NC`` (whitespace-separated columns
        of the first non-blank line), ``NTB`` (records per block), the empty
        input/output buffers, ``inp_idx`` and the two file handles, and
        prints the computed sizes.

        Raises:
            ValueError: if ``MMB`` is smaller than 2, because no input buffer
                would exist.
        """
        if self.MMB < 2:
            raise ValueError(
                "MMB must be at least 2 (one output block and at least one "
                "input block), got %r" % (self.MMB,))

        # One pass gives both the record count and the column count.
        self.NR = 0
        self.NC = 0
        with open(self.input_file, "r") as f:
            for line in f:
                stripped = line.rstrip()
                if not stripped:
                    continue
                if self.NR == 0:
                    # the first non-blank line defines the number of columns
                    self.NC = len(stripped.split())
                self.NR += 1

        # assuming that each value in tuple is 32-bit (4-byte) int
        # (BLOCK_SIZE is a module-level name not shown here; presumably a
        # size in bytes - confirm). Clamp to 1 so that an empty input or a
        # tuple wider than a block neither divides by zero nor stops
        # get_next from reading anything.
        if self.NC:
            self.NTB = max(1, BLOCK_SIZE // (4 * self.NC))
        else:
            self.NTB = 1

        print("MMB = number of main memory blocks = ", self.MMB)
        print("NTB = number of tuples in a block = ", self.NTB)
        print("NR = number of tuples in relation = ", self.NR)
        print("NC = number of cols in relation = ", self.NC)
        print("BR = number of blocks in relation = ", self.NR//self.NTB)

        self.out_buff = []
        self.inp_buff = [[] for _ in range(self.MMB - 1)]

        # index of the input buffer get_next looks at next
        self.inp_idx = 0

        self.inp_fd = open(self.input_file, "r")
        try:
            self.out_fd = open(self.output_file, "w")
        except Exception:
            # do not leave the input handle open if the output cannot be
            # created
            self.inp_fd.close()
            raise

    def dup_close(self):
        """Close the input and output files."""
        try:
            self.inp_fd.close()
        finally:
            # still close the output if closing the input failed
            self.out_fd.close()

    def get_next(self):
        """Return the next input record, or ``None`` when the input is used up.

        Input buffers are visited in rotation starting at ``inp_idx``. A
        non-empty buffer yields its last record; an empty buffer is refilled
        with up to ``NTB`` lines from the input file and the rotation moves
        on to the next buffer.

        Every returned record ends with a newline: a final line that lacks
        one gets it added, so that it compares equal to an identical earlier
        line and does not run into the following record in the output.
        """
        buffers = self.inp_buff

        # One visit more than there are buffers: after every empty buffer has
        # been refilled once, the extra visit returns to the first refilled
        # buffer and takes a record from it.
        for _visit in range(len(buffers) + 1):
            ib = buffers[self.inp_idx]
            self.inp_idx = (self.inp_idx + 1) % len(buffers)

            if ib:
                return ib.pop()

            for _slot in range(self.NTB):
                row = self.inp_fd.readline()
                if not row:
                    # end of file
                    break
                if not row.endswith("\n"):
                    row += "\n"
                ib.append(row)

        # every buffer is empty and the file has nothing left
        return None
Exemplo n.º 6
0
    def test_set_and_get(self):
        """Values stored with set() come back from get(); unknown keys raise."""
        hash_table = MyHash()

        # basic round trip, first with string key/value, then with ints
        for stored_key, stored_val in (('hello', 'world'), (3, 5)):
            hash_table.set(stored_key, stored_val)
            self.assertEqual(hash_table.get(stored_key), stored_val)

        # round trip with arbitrary objects as key and value
        class test(object):
            pass

        class test2(object):
            pass

        key, val = test(), test2()
        hash_table.set(key, val)
        fetched = hash_table.get(key)
        self.assertIsInstance(fetched, test2)

        # a key that was never stored must raise KeyError
        self.assertRaises(KeyError, hash_table.get, 'test')