def test_linear_probe_bad_size(self):
    """LinearProbe must raise TypeError when the size is not an integer."""
    # None, a string, a builtin function and a float are all invalid sizes.
    invalid_sizes = (None, 'string', sum, 420.69)
    for bad_size in invalid_sizes:
        with self.assertRaises(TypeError):
            ht.LinearProbe(bad_size, hf.h_ascii)
def test_linear_probing(self):
    """A stored key is found and an absent key gives None, per hash function."""
    tables = [
        hash_tables.LinearProbe(1000, hash_functions.h_ascii),
        hash_tables.LinearProbe(1000, hash_functions.h_rolling),
        hash_tables.LinearProbe(1000, hash_functions.h_myown),
    ]

    # Populate every table before checking any of them.
    for table in tables:
        table.add('ABC', 30)

    for table in tables:
        self.assertEqual(table.search('ABC'), 30)
        self.assertEqual(table.search('DEF'), None)
def test_linear_probe_full_random(self):
    """Inserting into a completely filled table raises IndexError."""
    capacity = random.randint(1, 10000)
    probe = hash_tables.LinearProbe(hash_functions.h_ascii, capacity)

    # Fill every slot with a distinct key.
    for number in range(capacity):
        probe.insert(str(number), number)

    with self.assertRaises(IndexError):
        probe.insert('full', 10)
    # The rejected key must not be retrievable afterwards.
    assert probe.search('full') == -1
def test_linear_probing(self):
    """Exercise add and search on a 500-slot table hashed with h_sedgwicks."""
    probe = ht.LinearProbe(500, hf.h_sedgwicks)

    # add reports success
    added = probe.add('key', 'value')
    self.assertEqual(added, True)

    # a stored key returns its value
    found = probe.search('key')
    self.assertEqual(found, 'value')

    # a key that was never stored returns None
    missing = probe.search('wrong_key')
    self.assertEqual(missing, None)
def test_linear_probe_bad_function_name(self):
    """A table given a non-callable hash refuses adds and finds nothing."""
    alphabet = s.ascii_uppercase + s.digits
    hash_table = hash_tables.LinearProbe(100, "not function")

    # Random 100-character key and value.
    key = ''.join(r.choices(alphabet, k=100))
    value = ''.join(r.choices(alphabet, k=100))

    added = hash_table.add(key, value)
    self.assertFalse(added)
    self.assertEqual(hash_table.search(key), None)
def test_linear_probing(self):
    """Round-trip one key through tables using h_ascii, h_rolling and h_DJB."""

    def check_round_trip(hash_function):
        # Same scenario for every hash: store one pair, then look up the
        # stored key and a key that was never added.
        table = hash_tables.LinearProbe(1000, hash_function)
        table.add('test_key', 'test_value')
        self.assertEqual(table.search('test_key'), 'test_value')
        self.assertNotEqual(table.search('test_key'), 'bad')
        self.assertEqual(table.search('bad_key'), None)

    check_round_trip(hash_functions.h_ascii)
    check_round_trip(hash_functions.h_rolling)
    check_round_trip(hash_functions.h_DJB)
def test_linear_probe_add_search_random(self):
    """A random lowercase key inserted with a random value can be found."""
    table = hash_tables.LinearProbe(hash_functions.h_ascii, 100)

    # 1-10 random characters in the code-point range 97..122 ('a'..'z').
    length = random.randint(1, 10)
    key = ''.join(chr(random.randint(97, 122)) for _ in range(length))
    value = random.randint(0, 10000)

    inserted = table.insert(key, value)
    assert inserted is True
    assert table.search(key) == value
def test_linearprobe_h_ascii_single_element(self):
    """A one-slot table stores and returns a single random key/value pair."""
    table = ht.LinearProbe(1, ht.h_ascii)

    length = random.randint(1, 50)
    value = random.randint(0, 999)
    # Random printable ASCII key (code points 32..126).
    key = ''.join(chr(random.randint(32, 126)) for _ in range(length))

    table.add(key, value)
    self.assertEqual(value, table.search(key))
def test_linear_probe_rehashing(self):
    """Adding twice as many entries as the initial size must still succeed."""
    size = 1000
    alphabet = s.ascii_uppercase + s.digits
    hash_table = hash_tables.LinearProbe(size, hf.h_ascii)

    expected = {}
    for _ in range(size * 2):
        key = ''.join(r.choices(alphabet, k=100))
        value = ''.join(r.choices(alphabet, k=100))
        expected[key] = value
        self.assertTrue(hash_table.add(key, value))

    # Every pair added above must still be retrievable.
    for key, value in expected.items():
        self.assertEqual(hash_table.search(key), value)
def test_linear_probe_nonexistent_key(self):
    """Searching a half-full table for a key never added returns None."""
    size = 100
    alphabet = s.ascii_uppercase + s.digits
    hash_table = hash_tables.LinearProbe(size, hf.h_ascii)

    inserted = {}
    for _ in range(size // 2):
        key = ''.join(r.choices(alphabet, k=100))
        value = ''.join(r.choices(alphabet, k=100))
        inserted[key] = value
        self.assertTrue(hash_table.add(key, value))

    # Generated keys are 100 uppercase/digit characters, so this lowercase
    # sentence cannot be one of them.
    absent_key = "This is a key that is very unlikely to be generated"
    self.assertEqual(hash_table.search(absent_key), None)
def test_linear_probe_ascii_variable_add_search(self):
    """Add one random key to tables of random size and verify its placement.

    For 100 random (table size, key, value) combinations the pair must sit
    in the slot computed by ``hf.h_ascii`` and be returned by ``search``.
    """
    # string.ascii_letters is ascii_lowercase + ascii_uppercase; build once.
    letters = string.ascii_letters
    for _ in range(100):
        test_length = rdm.randint(1, 100)
        # The value must be an actual random integer; the previous code
        # stored the ``rdm.randint`` function object itself.
        test_value = rdm.randint(1, 1000)
        key_length = rdm.randint(1, 100)
        test_key = ''.join(rdm.choice(letters) for _ in range(key_length))

        test_table = ht.LinearProbe(test_length, hf.h_ascii)
        test_table.add(test_key, test_value)

        # With a single entry there is no collision, so the pair must be in
        # its home slot.
        home_slot = hf.h_ascii(test_key, test_length)
        self.assertEqual((test_key, test_value), test_table.T[home_slot])
        self.assertEqual(test_value, test_table.search(test_key))
def test_linear_probe_rolling_collision(self):
    """Adding one key twice keeps the first value and probes to the next slot."""
    test_key = 'teststring'
    for _ in range(100):
        test_length = rdm.randint(2, 1000)
        first_value = rdm.randint(1, 1000)
        second_value = rdm.randint(1, 1000)

        test_table = ht.LinearProbe(test_length, hf.h_rolling)
        test_table.add(test_key, first_value)
        test_table.add(test_key, second_value)

        # search must return the entry that was stored first.
        self.assertEqual(first_value, test_table.search(test_key))

        # The second entry is expected one slot after the home slot,
        # wrapping to slot 0 when the home slot is the last one.
        home_slot = hf.h_rolling(test_key, test_length)
        if test_table.N - 1 == home_slot:
            next_slot = 0
        else:
            next_slot = home_slot + 1
        self.assertEqual((test_key, second_value), test_table.T[next_slot])
def time_unsorted(arg, unsorted_data):
    """Time the insertion of every record into a hash table or a tree.

    Args:
        arg: ``'hash'`` to insert into a ``hash_tables.LinearProbe`` of
            100000 slots hashed with ``hash_functions.h_rolling``, or
            ``'tree'`` to insert through ``bt.insert``.
        unsorted_data: sequence of records; element 0 of each record is the
            key and element 1 the value. For ``'tree'`` the key is converted
            with ``int()``.

    Returns:
        Elapsed insertion time in seconds as a float. Construction of the
        container is not included in the measurement.

    Raises:
        ValueError: if ``arg`` is neither ``'hash'`` nor ``'tree'``
            (previously this surfaced as an UnboundLocalError).
    """
    if arg == 'hash':
        table = hash_tables.LinearProbe(100000, hash_functions.h_rolling)
        # perf_counter is monotonic and high resolution, unlike time.time.
        start = time.perf_counter()
        for record in unsorted_data:
            table.add(record[0], record[1])
        return time.perf_counter() - start

    if arg == 'tree':
        root = None
        start = time.perf_counter()
        for record in unsorted_data:
            # NOTE(review): the result of bt.insert is discarded, so `root`
            # stays None for every call. If bt.insert returns the new root,
            # this times inserts into an empty tree - confirm against bt.
            bt.insert(root, int(record[0]), record[1])
        return time.perf_counter() - start

    raise ValueError(
        "arg must be 'hash' or 'tree', got {!r}".format(arg))
def test_linearprobe_h_rolling_multiple_elements(self):
    """Up to 500 distinct random keys added to a 1000-slot table are all found.

    Generation stops early if ``add`` reports -1. Every key recorded in the
    reference dict must then be returned by ``search`` with its value.
    """
    tablesize = 1000
    table = ht.LinearProbe(tablesize, ht.h_rolling)
    expected = {}

    for _ in range(500):
        value = random.randint(0, 100)
        length = random.randint(1, 50)
        # Random printable ASCII key (code points 32..126).
        key = ''.join(chr(random.randint(32, 126)) for _ in range(length))

        if key in expected:
            continue
        # Add exactly once: the previous code called add a second time after
        # the success check, inserting every key twice.
        if table.add(key, value) == -1:
            break
        expected[key] = value

    for key, value in expected.items():
        self.assertEqual(value, table.search(key))
def test_search_bad_value(self):
    """Searching for a key that was never added returns None."""
    table = ht.LinearProbe(50, hf.h_ascii)
    table.add('text', 'value')

    result = table.search('nothere')
    self.assertEqual(result, None)
def test_linear_probe_replace_key(self):
    """Re-inserting a key replaces its value without using another slot."""
    table = hash_tables.LinearProbe(hash_functions.h_ascii, 30)

    # Same key twice; the second value should win.
    for value in (10, 100):
        table.insert('ayo', value)

    assert table.capacity == 1
    assert table.search('ayo') == 100
def test_linear_probe_key_not_in_table(self):
    """Searching an empty table yields -1."""
    empty_table = hash_tables.LinearProbe(hash_functions.h_ascii, 30)
    result = empty_table.search('not in table')
    assert result == -1
def testLinearProbe_add_to_full_ascii(self):
    """Adding to a table whose every slot is occupied must report failure."""
    # Start at 1: a zero-slot table is a degenerate case (nothing to fill,
    # and presumably a modulo-by-zero in the hash) that would make this
    # test depend on the random draw.
    size = random.randint(1, 100)
    test = hash_tables.LinearProbe(size, hash_functions.h_ascii)

    # Occupy every slot directly so no free position remains.
    test.T = [str(random.randint(0, 100)) for _ in range(test.N)]

    self.assertFalse(test.add('key', 10))
def testLinearProbe_search_not_in_table_ascii(self):
    """Searching a fully occupied table for an absent key gives a falsy result."""
    test = hash_tables.LinearProbe(10, hash_functions.h_ascii)

    # Overwrite the slots directly with random numeric strings.
    filler = []
    for _ in range(test.N):
        filler.append(str(random.randint(0, 100)))
    test.T = filler

    found = test.search('key')
    self.assertFalse(found)
def test_linear_probe_search_1(self):
    """A single inserted pair is returned by search."""
    probe = hash_tables.LinearProbe(hash_functions.h_ascii, 100)
    probe.insert('woah!', 1)
    found = probe.search('woah!')
    assert found == 1
def test_linear_probe_add_empty(self):
    """Inserting into an empty table returns True."""
    probe = hash_tables.LinearProbe(hash_functions.h_ascii, 100)
    inserted = probe.insert('woah!', 1)
    assert inserted is True
def testLinearProbe_search_in_table_python(self):
    """search finds a key in a table whose slots were filled by hand."""
    test = hash_tables.LinearProbe(10, hash_functions.h_python)

    # Slot i holds (str(i), 2 * i); the key '3' is therefore paired with 6.
    slots = []
    for index in range(test.N):
        slots.append((str(index), 2 * index))
    test.T = slots

    self.assertEqual(test.search('3'), 6)
def test_linear_probe_bad_fxn(self):
    """LinearProbe must raise TypeError when the hash is not a function."""
    # None, a string, an int and a float are all invalid hash functions.
    not_functions = (None, 'string', 5, 420.69)
    for bad_function in not_functions:
        with self.assertRaises(TypeError):
            ht.LinearProbe(5, bad_function)
def test_no_overwrite(self):
    """Adding an existing key again must not replace the stored value."""
    table = ht.LinearProbe(50, hf.h_ascii)
    for value in ('value', 'newvalue'):
        table.add('text', value)

    # Slot 3 is where this test expects 'text' to live in a 50-slot table
    # (presumably h_ascii sums the character codes, 453, modulo 50 -
    # confirm against hf.h_ascii).
    stored_entry = table.T[3]
    self.assertEqual(stored_entry[1], 'value')
def test_search_function(self):
    """search returns the value stored under an existing key."""
    table = ht.LinearProbe(50, hf.h_ascii)
    table.add('text', 'value')

    found = table.search('text')
    self.assertEqual(found, 'value')
def testLinearProbe_add_to_empty_ascii(self):
    """Adding to an empty table of random size must succeed."""
    # Start at 1: a zero-slot table has no room for any entry, so the
    # previous lower bound of 0 made this test fail at random.
    size = random.randint(1, 100)
    test = hash_tables.LinearProbe(size, hash_functions.h_ascii)

    self.assertTrue(test.add('key', 10))
def test_linear_probe_search_key_none(self):
    """Searching with None as the key returns None."""
    table = ht.LinearProbe(5, hf.h_ascii)
    result = table.search(None)
    self.assertEqual(None, result)
def test_linear_probe_add_key_none(self):
    """Adding with None as the key returns None."""
    table = ht.LinearProbe(5, hf.h_ascii)
    result = table.add(None, 420)
    self.assertEqual(None, result)
def test_add_function(self):
    """add places the pair in the slot the hash function selects."""
    table = ht.LinearProbe(50, hf.h_ascii)
    table.add('text', 'value')

    # Slot 3 is where this test expects 'text' to land in a 50-slot table
    # (presumably h_ascii sums the character codes, 453, modulo 50 -
    # confirm against hf.h_ascii).
    stored_entry = table.T[3]
    self.assertEqual(stored_entry[1], 'value')
def _parse_args():
    """Define the command-line interface and return the parsed arguments."""
    parser = argparse.ArgumentParser(
        description='Plot gene expression for tissue type and '
                    'tissue group given a gene')
    parser.add_argument('--gene_reads', type=str,
                        help='File containing gene reads',
                        required=True)
    parser.add_argument('--sample_attributes', type=str,
                        help='File containing the sample attributes',
                        required=True)
    parser.add_argument('--gene', type=str,
                        help='Name of the gene you wish to analyze',
                        required=True)
    parser.add_argument(
        '--group_type', type=str,
        help='Name of the group of samples you wish to analyze expression for',
        required=True)
    parser.add_argument('--output_file', type=str,
                        help='Name of the file the boxplot will be saved to',
                        required=True)
    return parser.parse_args()


def _read_sample_attributes(path):
    """Read the tab-separated sample attribute file.

    Returns a ``(header, rows)`` pair: the fields of the first line and one
    list of fields per following line. ``header`` is None for an empty file.
    """
    header = None
    rows = []
    with open(path) as handle:
        for line in handle:
            fields = line.rstrip().split('\t')
            if header is None:
                header = fields
            else:
                rows.append(fields)
    return header, rows


def _load_gene_reads(path, gene_name, read_table, gene_name_col=1):
    """Store the read count of every sample for ``gene_name`` in ``read_table``.

    The gzipped file is expected to hold a version line, a dimension line, a
    tab-separated header whose first two columns describe the gene and whose
    remaining columns are sample IDs, then one row per gene.

    Returns True when a row for ``gene_name`` was found, False otherwise.
    Raises ValueError if the dimension line does not consist of integers.
    """
    first_sample_col = 2
    with gzip.open(path, 'rt') as handle:
        next(handle, None)  # version line, not used
        # The dimensions are not needed; parsing them only validates the
        # file format (a malformed line raises ValueError).
        for token in next(handle, '').rstrip().split():
            int(token)
        header = next(handle, '').rstrip().split('\t')

        for line in handle:
            fields = line.rstrip().split('\t')
            if len(fields) <= gene_name_col:
                continue
            if fields[gene_name_col] != gene_name:
                continue
            # Pair each sample ID with the value in the SAME column. The
            # header must stay in file order for this to be correct.
            sample_columns = zip(header[first_sample_col:],
                                 fields[first_sample_col:])
            for sample_id, count in sample_columns:
                read_table.add(str(sample_id), count)
            # Only the first matching row is used; stop reading the file.
            return True
    return False


def _collect_group_counts(group_table, read_table, groups):
    """Return one list of integer read counts per entry of ``groups``.

    Every (group, sample ID) entry stored in the buckets of ``group_table``
    is checked individually, so entries of different groups that share a
    bucket are attributed to their own group.
    """
    group_index = {name: idx for idx, name in enumerate(groups)}
    counts = [[] for _ in groups]
    for bucket in group_table.T:
        for entry in bucket:
            idx = group_index.get(entry[0])
            if idx is None:
                continue
            read = read_table.search(str(entry[1]))
            # Samples without a read count for this gene are skipped.
            if read is not None:
                counts[idx].append(int(read))
    return counts


def main():
    """Plot the expression counts of one gene, grouped by a sample attribute.

    Reads the sample attribute file to map each value of ``--group_type`` to
    its sample IDs, reads the gzipped gene read file to obtain the count of
    ``--gene`` for every sample, and hands the per-group counts to
    ``data_viz.boxplot``, which writes ``--output_file``.

    Exits with an error message if an input file cannot be read or a
    required column is missing.
    """
    args = _parse_args()

    sample_id_col_name = 'SAMPID'
    tissue_group_col_name = args.group_type
    gene_name = args.gene

    # Hash table sizes.
    n_samples = 100000
    n_groups = 1000

    try:
        info_header, samples = _read_sample_attributes(args.sample_attributes)
    except (OSError, ValueError) as err:
        raise SystemExit('Could not read sample info file') from err
    if info_header is None:
        raise SystemExit('Could not read sample info file')

    # Column positions of the group attribute and the sample ID.
    tissue_group_col_idx = linear_search(tissue_group_col_name, info_header)
    sample_id_col_idx = linear_search(sample_id_col_name, info_header)
    # NOTE(review): assumes linear_search reports "not found" as -1, which
    # would otherwise silently select the last column - confirm.
    if -1 in (tissue_group_col_idx, sample_id_col_idx):
        raise SystemExit('Could not find the ' + sample_id_col_name + ' or '
                         + str(tissue_group_col_name)
                         + ' column in the sample info file')

    group_table = ht.ChainedHash(n_groups, hf.h_rolling)
    read_table = ht.LinearProbe(n_samples, hf.h_rolling)

    # group name -> sample IDs
    try:
        for sample in samples:
            group_table.add(sample[tissue_group_col_idx],
                            sample[sample_id_col_idx])
    except ValueError as err:
        raise SystemExit('Could not assign Sample IDs') from err

    # sample ID -> read count of the requested gene
    try:
        _load_gene_reads(args.gene_reads, gene_name, read_table)
    except (OSError, ValueError) as err:
        raise SystemExit('Could not read data info file') from err

    # Sorted so the boxes appear in the same order on every run.
    groups = sorted(set(group_table.keys))
    group_counts = _collect_group_counts(group_table, read_table, groups)

    data_viz.boxplot(group_counts, groups,
                     str(args.gene) + ' Expression of Tissue Group',
                     'Tissue Group = ' + str(args.group_type),
                     str(args.gene) + ' Counts',
                     args.output_file)