def test_chained_hash_rolling_collision(self):
     # Adding a second value under an existing key should chain it into the
     # same bucket: search() still returns the first value, while the
     # second (key, value) pair sits at position 1 of that bucket's list.
     for i in range(100):
         test_length = rdm.randint(2, 1000)
         test_value1 = rdm.randint(1, 1000)
         test_value2 = rdm.randint(1, 1000)
         test_key = 'teststring'
         test_table = ht.ChainedHash(test_length, hf.h_rolling)
         test_table.add(test_key, test_value1)
         test_table.add(test_key, test_value2)
         self.assertEqual(test_value1, test_table.search(test_key))
         self.assertEqual((test_key, test_value2),
                          test_table.T[hf.h_rolling(test_key,
                                                    test_length)][1])
    def test_chained_hash_ascii_basic(self):
        """Insert many random 100-char keys and verify each is retrievable."""
        table_size = 10000
        table = hash_tables.ChainedHash(table_size, hf.h_ascii)

        expected = {}
        for _ in range(table_size // 2):
            rand_key = ''.join(r.choices(s.ascii_uppercase + s.digits, k=100))
            rand_val = ''.join(r.choices(s.ascii_uppercase + s.digits, k=100))
            expected[rand_key] = rand_val
            self.assertTrue(table.add(rand_key, rand_val))

        for rand_key, rand_val in expected.items():
            self.assertEqual(table.search(rand_key), rand_val)
    def test_chained_hash_nonexistent_key(self):
        """A key that was never inserted yields None from search()."""
        size = 100
        table = hash_tables.ChainedHash(size, hf.h_ascii)

        # Populate the table with random entries so the miss is meaningful.
        for _ in range(size // 2):
            rand_key = ''.join(r.choices(s.ascii_uppercase + s.digits, k=100))
            rand_val = ''.join(r.choices(s.ascii_uppercase + s.digits, k=100))
            self.assertTrue(table.add(rand_key, rand_val))

        missing = "This is a key that is very unlikely to be generated"
        self.assertIsNone(table.search(missing))
 def test_chained_hash_rolling_variable_key_store(self):
     """Keys of random length added to a random-size table all appear in
     table.keys, in insertion order.
     """
     letters = string.ascii_lowercase + string.ascii_uppercase
     for i in range(10):
         keys = []
         test_length = rdm.randint(1, 100)
         test_table = ht.ChainedHash(test_length, hf.h_rolling)
         for k in range(50):
             # BUG FIX: the original assigned the function rdm.randint
             # itself; call it so an actual random int is stored.
             test_value = rdm.randint(1, 1000)
             test_key = ''
             for j in range(rdm.randint(1, 100)):
                 test_key += rdm.choice(letters)
             keys.append(test_key)
             test_table.add(test_key, test_value)
         # BUG FIX: assert inside the loop so every table is checked,
         # not only the last one built.
         self.assertEqual(keys, test_table.keys)
 def test_chained_hash_rolling_variable_add_search(self):
     """add() stores (key, value) in the bucket selected by h_rolling and
     search() retrieves the value, for random keys and table sizes.
     """
     for i in range(100):
         test_length = rdm.randint(1, 100)
         letters = string.ascii_lowercase + string.ascii_uppercase
         # BUG FIX: call rdm.randint instead of storing the function
         # object as the value.
         test_value = rdm.randint(1, 1000)
         test_key = ''
         for j in range(rdm.randint(1, 100)):
             test_key += rdm.choice(letters)
         test_table = ht.ChainedHash(test_length, hf.h_rolling)
         test_table.add(test_key, test_value)
         self.assertEqual((test_key, test_value),
                          test_table.T[hf.h_rolling(test_key,
                                                    test_length)][0])
         self.assertEqual(test_value, test_table.search(test_key))
Exemplo n.º 6
0
    def test_chainedhash_h_ascii_multiple_elements(self):
        """Add up to 500 random printable-ASCII keys; verify each lookup."""
        tablesize = 1000
        table = ht.ChainedHash(tablesize, ht.h_ascii)
        tabledict = {}

        for _ in range(500):
            randkey = ""
            randomval = random.randint(0, 100)
            # Build a random key of 1-50 printable ASCII characters.
            # (Separate loop variable: the original shadowed the outer i.)
            for _ in range(random.randint(1, 50)):
                randkey += chr(random.randint(32, 126))
            if randkey in tabledict:
                continue  # skip duplicates so expected values stay unique
            if table.add(randkey, randomval) == -1:
                break  # table refused the insert
            # BUG FIX: the original called table.add a second time here,
            # inserting every key twice.
            tabledict[randkey] = randomval

        for key in tabledict:
            self.assertEqual(tabledict[key], table.search(key))
Exemplo n.º 7
0
 def test_add_function(self):
     """add() stores the value in bucket 3 (sum of ASCII codes of
     'text' is 453, and 453 % 50 == 3)."""
     table = ht.ChainedHash(50, hf.h_ascii)
     table.add('text', 'value')
     self.assertEqual('value', table.T[3][0][1])
 def test_chained_hash_search_key_none(self):
     """Searching with a None key yields None rather than raising."""
     table = ht.ChainedHash(5, hf.h_ascii)
     self.assertIsNone(table.search(None))
def main():
    """Box-plot gene read counts per sample group from GTEx files.

    Builds a group -> sample-IDs map from the sample info file, then a
    sample-ID -> count map from the gzipped reads file for the target
    gene, and hands the grouped counts to dv.boxplot.
    """
    arguments = parse_arguments()

    data_file_name = arguments.data_file
    sample_info_file_name = arguments.sample_file
    group_col_name = arguments.sample_type
    sample_id_col_name = 'SAMPID'

    gene_name = arguments.gene

    sample_info_header, samples = parse_sample_file(sample_info_file_name)

    # Index of the grouping column (e.g. SMTS) in the sample header.
    key = linear_search(group_col_name, sample_info_header)

    # Map: group name -> list of sample IDs in that group.
    table = hash_tables.ChainedHash(50, hash_functions.h_rolling)
    keys = []
    for i in samples:
        result = table.search(i[key])
        if result is None:
            table.add(i[key], [i[0]])
            keys.append(i[key])
        else:
            # Group already present: append this sample ID to its list.
            loc = table.search_loc(i[key])
            table.T[loc][0][1].append(i[0])

    version = None
    dim = None
    data_header = None

    gene_name_col = 1

    # Map: sample ID -> read count for the target gene.
    table_2 = hash_tables.ChainedHash(10000, hash_functions.h_rolling)

    for l in gzip.open(data_file_name, 'rt'):
        # First three lines: version string, matrix dimensions, header.
        if version is None:  # BUG FIX: identity comparison with None
            version = l
            continue

        if dim is None:
            dim = [int(x) for x in l.rstrip().split()]
            continue

        if data_header is None:
            data_header = []
            i = 0
            for field in l.rstrip().split('\t'):
                data_header.append([field, i])
                i += 1
            data_header.sort(key=lambda tup: tup[0])

            continue

        A = l.rstrip().split('\t')

        # BUG FIX: compare against the requested gene, not the
        # hard-coded string 'BRCA2'.
        if A[gene_name_col] == gene_name:
            # NOTE(review): data_header was sorted by field name, so
            # zipping its tail against the unsorted row A may misalign
            # sample IDs and counts — confirm against the file format.
            for header, gene_data in zip(data_header[2:], A[2:]):
                table_2.add(header[0], gene_data)
    group_counts = [[] for _ in range(len(keys))]
    for i in range(len(keys)):
        for val in table.search(keys[i]):
            result = table_2.search(val)
            if result is not None:
                group_counts[i].append(int(result))

    dv.boxplot(group_counts,
               keys,
               ylabel='Gene Read Counts',
               xlabel=arguments.sample_type,
               title=arguments.gene,
               out_file_name=arguments.output_filename)
Exemplo n.º 10
0
 def testChainedHash_search_not_in_table_ascii(self):
     """search() on an empty table is falsy (no match for the key)."""
     empty_table = hash_tables.ChainedHash(10, hash_functions.h_ascii)
     self.assertFalse(empty_table.search('key'))
Exemplo n.º 11
0
 def test_chained_hash_key_not_in_table(self):
     # This ChainedHash variant takes (hash_function, capacity) and its
     # search() returns -1 (not None) for a missing key.
     table = hash_tables.ChainedHash(hash_functions.h_ascii, 30)
     assert table.search('not in table') == -1
Exemplo n.º 12
0
 def test_chained_hash_add_empty(self):
     # insert() into an empty table returns True on success, and the key
     # is then listed in table.keys. Note this ChainedHash variant takes
     # (hash_function, capacity) and names its adder insert().
     table = hash_tables.ChainedHash(hash_functions.h_ascii, 100)
     assert(table.insert('woah!', 1) is True)
     assert('woah!' in table.keys)
Exemplo n.º 13
0
 def test_search_bad_value(self):
     """Searching for an absent key returns None."""
     table = ht.ChainedHash(50, hf.h_ascii)
     table.add('text', 'value')
     self.assertIsNone(table.search('nothere'))
def main():
    """Box-plot per-group read counts for one gene.

    Parses the reads file given by --gene_reads, locates the row for
    the requested --gene, and plots its counts grouped by tissue.
    """
    parser = argparse.ArgumentParser(
        description='find tissue counts for specific gene', prog='bay')
    parser.add_argument('--gene_reads',
                        type=str,
                        help='GTEX gene counts',
                        required=True)
    parser.add_argument('--sample',
                        type=str,
                        help='GTEX samples file',
                        required=True)
    parser.add_argument('--group_type',
                        type=str,
                        help='group: either SMTS or SMTSD',
                        required=True)
    parser.add_argument('--gene', type=str, help='gene name', required=True)
    parser.add_argument('--output_file',
                        type=str,
                        help='desired output file name',
                        required=True)

    args = parser.parse_args()

    # First three lines of the reads file: version, dimensions, header.
    version = None
    dim = None
    count_headers = None
    # BUG FIX: cch ([header, column-index] pairs) was never initialized.
    cch = []
    for l in open(args.gene_reads, 'rt'):
        if version is None:
            # BUG FIX: the original assigned to 'v', leaving version as
            # None so every subsequent line was swallowed by this branch.
            version = l
            continue
        if dim is None:
            dim = l
            continue
        if count_headers is None:
            # BUG FIX: rstrip was referenced but never called.
            count_headers = l.rstrip().split('\t')
            for i in range(len(count_headers)):
                cch.append([count_headers[i], i])
            continue

    # NOTE(review): 'l' here is the LAST line of the file; this only
    # finds the gene if it sits on the final row — confirm intent.
    counts = l.rstrip().split('\t')
    desc = linear_search('Description', count_headers)

    if counts[desc] == args.gene:
        to_return = []
        # Map: sample-column header -> read count for the gene's row.
        chainedhash = ht.ChainedHash(1000000, hf.h_rolling)
        for i in range(desc + 1, len(count_headers)):
            chainedhash.add(count_headers[i], int(counts[i]))
        # NOTE(review): 'group' and 'table' are not defined in this
        # function; presumably module-level — verify before running.
        for t in group:
            list_counts = []
            location = table.search(t)
            if location is None:
                continue
            for s in location:
                count = chainedhash.search(s)
                if count is None:
                    continue
                list_counts.append(count)
            to_return.append(list_counts)

        dv.boxplot(to_return,
                   args.output_file,
                   'x',
                   'y',
                   'title',
                   groups=group)
def main():
    """Plot per-group GTEx read counts for one gene as a box plot.

    Maps each tissue group (--group_type, SMTS or SMTSD) to its sample
    IDs from --sample_attributes, scans the gzipped --gene_reads file
    for the requested gene's row, and writes a box plot to
    --output_file. Exits with status 1 on any input problem.

    NOTE(review): relies on a module-level ``args`` (argparse
    namespace) rather than parameters — confirm it is set before
    main() runs.
    """

    # data_file_name='GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.acmg_59.gct.gz'
    data_file_name = args.gene_reads
    # sample_info_file_name='GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt'
    sample_info_file_name = args.sample_attributes

    # samples_to_count_map = get_samples_to_count_map

    # group_col_name = 'SMTS'
    group_col_name = args.group_type
    if (group_col_name != 'SMTS') and (group_col_name != 'SMTSD'):
        print('--group_type must be either SMTS or SMTSD')
        sys.exit(1)

    sample_id_col_name = 'SAMPID'

    # gene_name = 'ACTA2'
    gene_name = args.gene

    samples = []
    sample_info_header = None

    # First line of the sample file is the header; the rest are samples.
    try:
        for l in open(sample_info_file_name):
            if sample_info_header is None:
                sample_info_header = l.rstrip().split('\t')
            else:
                samples.append(l.rstrip().split('\t'))
    except FileNotFoundError:
        print('--sample_attributes could not be found')
        sys.exit(1)

    # linear_search on a None header (empty file) raises TypeError.
    try:
        group_col_idx = linear_search(group_col_name, sample_info_header)
        sample_id_col_idx = linear_search(sample_id_col_name,
                                          sample_info_header)
    except TypeError:
        print('--sample_attributes is not formatted properly,' +
              ' check that it is not empty')
        sys.exit(1)

    # Map: tissue group -> list of member sample IDs.
    tissue_to_samples_map = ht.ChainedHash(1000000, hf.h_rolling)
    groups = []

    for s in samples:
        key = s[group_col_idx]
        value = s[sample_id_col_idx]

        in_group = linear_search(key, groups)
        if in_group == -1:
            groups.append(key)

        hit = tissue_to_samples_map.search(key)

        # First sighting of this group: seed it with an empty list,
        # then append to the stored (shared) list object.
        if hit is None:
            tissue_to_samples_map.add(key, [])
            hit = tissue_to_samples_map.search(key)
        hit.append(value)

    version = None
    dim = None
    data_header = None

    gene_name_col = 1

    group_counts = [[] for i in range(len(groups))]

    gene_hits = 0

    # Map: sample ID (column header) -> its column index in the reads file.
    samples_to_count_map = ht.ChainedHash(1000000, hf.h_rolling)
    try:
        for l in gzip.open(data_file_name, 'rt'):
            # First three lines: version, matrix dimensions, column header.
            if version is None:
                version = l
                continue

            if dim is None:
                dim = [int(x) for x in l.rstrip().split()]
                continue

            if data_header is None:
                i = 0
                for field in l.rstrip().split('\t'):
                    samples_to_count_map.add(field, i)
                    i += 1
                data_header = 1

                continue

            A = l.rstrip().split('\t')

            if A[gene_name_col] == gene_name:
                gene_hits += 1
                # Collect this row's count for every member of each group.
                for group_idx in range(len(groups)):
                    members = tissue_to_samples_map.search(groups[group_idx])
                    for member in members:
                        member_idx = samples_to_count_map.search(member)
                        if member_idx is not None:
                            group_counts[group_idx].append(int(A[member_idx]))
                break
    except OSError:
        print('--gene_reads must be a gzipped file')
        sys.exit(1)
    except Exception:
        print('There was a problem with --gene_reads')
        sys.exit(1)
    if gene_hits == 0:
        print('Gene could not be found in given data')
        sys.exit(1)
    # NOTE(review): data_viz.boxplot is apparently expected to call
    # sys.exit when the output file already exists — confirm.
    try:
        data_viz.boxplot(group_counts, args.output_file, gene_name,
                         group_col_name, 'Gene read counts', groups)
    except SystemExit:
        print('--output_file already exists, please choose a different name')
        sys.exit(1)
    except ValueError:
        print('--output_file is of unsupported type, try a .png')
        sys.exit(1)
def main():
    """Time insert and search over a chosen key/value data structure.

    Reads up to --number_keys tab-separated key/value lines from
    --dataset into the structure named by --datastructure ('hash',
    'binary_tree', or 'avl_tree') and prints the elapsed insert and
    search times.
    """
    parser = argparse.ArgumentParser(description='Store key'
                                     'data structures',
                                     prog='insert_key_value_pairs')

    parser.add_argument('--datastructure',
                        type=str,
                        help='Name of '
                        "datastructure to use. Choose from 'hash', "
                        "'binary_tree', or 'avl_tree'",
                        required=True)

    parser.add_argument('--dataset',
                        type=str,
                        help='Name of txt file'
                        ', value pairs',
                        required=True)

    parser.add_argument('--number_keys',
                        type=int,
                        help='Number of keys from'
                        'dataset to read in',
                        required=True)

    args = parser.parse_args()

    datastructure = args.datastructure
    filename = args.dataset
    N = args.number_keys

    if datastructure == 'hash':
        print('initializing')
        hashtable = ht.ChainedHash(10000000, ht.hash_functions.h_rolling)
        # time insertion of the first N keys
        insert_t0 = time.time()
        counter = 0
        for line in open(filename, 'r'):
            data = line.rstrip().split('\t')
            hashtable.add(data[0], data[1])
            counter += 1
            if counter == N:
                break
        insert_t1 = time.time()
        # time searching for the same N keys
        search_t0 = time.time()
        counter = 0
        for line in open(filename, 'r'):
            data = line.rstrip().split('\t')
            hashtable.search(data[0])
            counter += 1
            if counter == N:
                break
        search_t1 = time.time()
        print('time to insert: ' + str(insert_t1 - insert_t0))
        print('time to search: ' + str(search_t1 - search_t0))

    elif datastructure == 'binary_tree':
        print('initialize binary tree')
        # time bulk insertion via the tree builder
        insert_t0 = time.time()
        datatree = binary_tree.create_tree(filename, N)
        insert_t1 = time.time()
        # time searching for the same N keys
        search_t0 = time.time()
        counter = 0
        for line in open(filename, 'r'):
            data = line.rstrip().split('\t')
            binary_tree.search(datatree, data[0])
            counter += 1
            if counter == N:
                break
        search_t1 = time.time()
        print('time to insert: ' + str(insert_t1 - insert_t0))
        print('time to search: ' + str(search_t1 - search_t0))

    elif datastructure == 'avl_tree':
        print('initialize AVL tree')
        # time bulk insertion via the AVL builder
        insert_t0 = time.time()
        datatree = avl_tree.create_AVLtree(filename, N)
        insert_t1 = time.time()
        # time searching for the same N keys
        search_t0 = time.time()
        counter = 0
        for line in open(filename, 'r'):
            data = line.rstrip().split('\t')
            avl_tree.search(datatree, data[0])
            counter += 1
            if counter == N:
                break
        search_t1 = time.time()
        print('time to insert: ' + str(insert_t1 - insert_t0))
        print('time to search: ' + str(search_t1 - search_t0))

    else:
        # BUG FIX: complete the truncated error message (matches the
        # fully-documented copy of this driver later in the file).
        print('does not recognize datastructure name')
Exemplo n.º 17
0
def main():
    """Creates box plots of gene expression data from GTEx analysis

    Parameters
    -----------
    --gene_reads_file : A GTEx_Analysis file ending in '.gct.gz'. Contains
    measured gene expression level by tissue type. Input as a string.
    ex. 'GTEx_Analysis_2017-06-05_v8_RNASeQCv1.1.9_gene_reads.acmg_59.gct.gz'

    --sample_info_file : A txt file containing sample identification
    information, corresponding to data in the .gz file. Input as a string.
    ex. 'GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt'

    --group_data_by : Data is displayed by tissue, and can be sorted by
    tissue groups (SMTS) or tissue types (SMTSD). Input either 'SMTS' or
    'SMTSD' as a string.

    --target_gene : The gene of interest to plot. Input as a string. A full
    list of available genes can be found here: https://github.com/swe4s/
    lectures/blob/master/data_integration/gtex/acmg_genes.txt

    --output_file_name: File name to save the output plot. Input as a string
    with the extension .png.

    Returns
    --------

    Function returns a box plot of expression data, saved in the local
    directory as output_file_name.

    """

    parser = argparse.ArgumentParser(description='plot gene expression data '
                                     'from gtex files',
                                     prog='plot_gtex.py')

    parser.add_argument('--gene_reads_file',
                        type=str,
                        help='Name of gene'
                        'count input file',
                        required=True)

    parser.add_argument('--sample_info_file',
                        type=str,
                        help='Name of sample'
                        'info input file',
                        required=True)

    parser.add_argument('--group_data_by',
                        type=str,
                        help='Select either'
                        'tissue groups (SMTS) or tissue types (SMTSD)',
                        required=True)

    parser.add_argument('--target_gene',
                        type=str,
                        help='Gene of interest to '
                        'plot',
                        required=True)

    parser.add_argument('--output_file_name',
                        type=str,
                        help='Name for saved'
                        'output graph',
                        required=True)

    args = parser.parse_args()

    # NOTE(review): plain attribute access cannot raise
    # FileNotFoundError / PermissionError; these can only fire when the
    # files are opened below — consider moving the try there.
    try:
        # file with gene read counts for each sample
        data_file_name = args.gene_reads_file
        # file with informational headers for each sample
        sample_info_file_name = args.sample_info_file
    except FileNotFoundError:
        print('Could not find input data file')
        sys.exit(1)
    except PermissionError:
        print('Could not open input data file')
        sys.exit(1)

    # plot gene expression of tissue groups (SMTS) or tissue types (SMTSD)
    group_col_name = args.group_data_by

    # gene of interest to plot
    gene_name = args.target_gene

    # Map: sample ID -> read count for the target gene.
    sample_to_count_map = ht.ChainedHash(1000000, ht.hash_functions.h_rolling)

    version = None
    dim = None
    data_header = None

    gene_name_col = 1

    for l in gzip.open(data_file_name, 'rt'):

        # First three lines: version, matrix dimensions, column header.
        if version is None:  # BUG FIX: identity comparison with None
            version = l
            continue

        if dim is None:
            dim = [int(x) for x in l.rstrip().split()]
            continue

        if data_header is None:
            data_header = l.rstrip().split('\t')
            # BUG FIX: drop the two non-sample columns (name/description)
            # once, here; the original popped two entries on *every* data
            # line, corrupting sample_ids after the first row.
            data_header.pop(0)
            data_header.pop(0)
            sample_ids = data_header
            continue

        A = l.rstrip().split('\t')
        print(A[gene_name_col])

        if A[gene_name_col] == gene_name:
            for sample_i in range(len(sample_ids)):
                # BUG FIX: row A still includes the two leading non-sample
                # columns, so the count for sample_ids[i] is A[i + 2].
                sample_to_count_map.add(sample_ids[sample_i],
                                        A[sample_i + 2])

    # Map: sample ID -> tissue group/type.
    samples_to_tissues_map = ht.ChainedHash(1000000,
                                            ht.hash_functions.h_rolling)

    # in new hash table, SAMPID is column 0, SMTS column 5, SMTSD column 6

    tissues_list = []

    for l in open(sample_info_file_name):
        line_split = l.rstrip().split('\t')
        if group_col_name == 'SMTS':
            if line_split[5] not in tissues_list:
                tissues_list.append(line_split[5])
            samples_to_tissues_map.add(line_split[0], line_split[5])
        if group_col_name == 'SMTSD':
            # BUG FIX: track the SMTSD column (6) in the tissue list; the
            # original collected column 5 names, which never match the
            # SMTSD values stored in the map below.
            if line_split[6] not in tissues_list:
                tissues_list.append(line_split[6])
            samples_to_tissues_map.add(line_split[0], line_split[6])

    tissues_list.pop(0)  # remove SMTS or SMTSD from list

    group_counts = []

    # Gather counts per tissue by matching each sample's mapped tissue.
    for tissue in tissues_list:
        counts = []
        for sample in sample_ids:
            if samples_to_tissues_map.search(sample) == tissue:
                counts.append(int(sample_to_count_map.search(sample)))
        group_counts.append(counts)

    # ploting with data_viz.py module
    # code will output a list of lists containing gene data for tissue type,
    # and a list of names corresponding to each list of data to box plot

    saved_plot_name = args.output_file_name
    title = str(gene_name)
    x_label = group_col_name
    y_label = "Gene read counts"
    data = group_counts
    x_ticks = tissues_list

    data_viz.boxplot(saved_plot_name, title, x_label, y_label, data, x_ticks)
def main():
    """
    test data structures for storing key, value pairs

    Arguments
    ---------
    --datastructure: the datastructure to build storing desired key, value
    pairs. Choose from 'hash', 'binary_tree', or 'avl_tree'.

    --dataset: a tab-separated txt file containing lines of key, value
    pairs to store

    --number_keys: the number of keys from dataset to read in

    Returns
    -------
    The specified data structure containing all key, value pairs. Also prints
    the elapsed time to insert all keys and elapsed time to search for all
    keys.

    """
    parser = argparse.ArgumentParser(description='Store key, value pairs in '
                                     'data structures',
                                     prog='insert_key_value_pairs')

    parser.add_argument('--datastructure',
                        type=str,
                        help='Name of '
                        "datastructure to use. Choose from 'hash', "
                        "'binary_tree', or 'avl_tree'",
                        required=True)

    parser.add_argument('--dataset',
                        type=str,
                        help='Name of txt file with key'
                        ', value pairs',
                        required=True)

    parser.add_argument('--number_keys',
                        type=int,
                        help='Number of keys from'
                        'dataset to read in',
                        required=True)

    args = parser.parse_args()

    datastructure = args.datastructure
    filename = args.dataset
    N = args.number_keys

    if datastructure == 'hash':
        # call hash tables submodule
        print('initialize hash table')
        hashtable = ht.ChainedHash(10000000, ht.hash_functions.h_rolling)
        # measure time to insert all keys in file
        insert_t0 = time.time()
        counter = 0
        for line in open(filename, 'r'):
            data = line.rstrip().split('\t')
            hashtable.add(data[0], data[1])
            counter += 1
            if counter == N:  # stop after the first N keys
                break
        insert_t1 = time.time()
        # measure time to search for all keys
        search_t0 = time.time()
        counter = 0
        for line in open(filename, 'r'):
            data = line.rstrip().split('\t')
            hashtable.search(data[0])
            counter += 1
            if counter == N:
                break
        search_t1 = time.time()
        print('time to insert: ' + str(insert_t1 - insert_t0))
        print('time to search: ' + str(search_t1 - search_t0))

    elif datastructure == 'binary_tree':
        # call binary_tree tree function
        print('initialize binary tree')
        # measure time to insert all keys in file
        insert_t0 = time.time()
        datatree = binary_tree.create_tree(filename, N)
        insert_t1 = time.time()
        # measure time to search for keys
        search_t0 = time.time()
        counter = 0
        for line in open(filename, 'r'):
            data = line.rstrip().split('\t')
            binary_tree.search(datatree, data[0])
            counter += 1
            if counter == N:
                break
        search_t1 = time.time()
        print('time to insert: ' + str(insert_t1 - insert_t0))
        print('time to search: ' + str(search_t1 - search_t0))

    elif datastructure == 'avl_tree':
        # call avl_tree tree function
        print('initialize AVL tree')
        # measure time to insert all keys in file
        insert_t0 = time.time()
        datatree = avl_tree.create_AVLtree(filename, N)
        insert_t1 = time.time()
        # measure time to search for keys
        search_t0 = time.time()
        counter = 0
        for line in open(filename, 'r'):
            data = line.rstrip().split('\t')
            avl_tree.search(datatree, data[0])
            counter += 1
            if counter == N:
                break
        search_t1 = time.time()
        print('time to insert: ' + str(insert_t1 - insert_t0))
        print('time to search: ' + str(search_t1 - search_t0))

    else:
        # unrecognized --datastructure value
        print('does not recognize datastructure name')
Exemplo n.º 19
0
def main():
    """Group GTEx samples by tissue, collect one gene's counts per group,
    and box-plot them, while timing binary-search vs hash-insert steps.

    NOTE(review): relies on a module-level ``args`` (gzfile, txtfile,
    group_type, gene, outfile) — confirm argparse runs before main().
    """
    data_file_name = args.gzfile
    sample_info_file_name = args.txtfile
    group_col_name = args.group_type
    gene_name = args.gene

    sample_id_col_name = 'SAMPID'

    # First line of the sample file is the header; the rest are samples.
    samples = []
    sample_info_header = None
    for l in open(sample_info_file_name):
        if sample_info_header is None:
            sample_info_header = l.rstrip().split('\t')
        else:
            samples.append(l.rstrip().split('\t'))

    group_col_idx = linear_search(group_col_name, sample_info_header)
    sample_id_col_idx = linear_search(sample_id_col_name, sample_info_header)

    groups = []
    members = []

    names = []

    # Map: group name -> sample names (one add per sample).
    MemTable = ht.ChainedHash(35, hf.h_rolling)

    for row_idx in range(len(samples)):
        sample = samples[row_idx]
        sample_name = sample[sample_id_col_idx]
        curr_group = sample[group_col_idx]
        names = names + [curr_group]

        curr_group_idx = linear_search(curr_group, groups)

        # New group: register it and start an empty member list.
        if curr_group_idx == -1:
            curr_group_idx = len(groups)
            groups.append(curr_group)
            members.append([])

        members[curr_group_idx].append(sample_name)
        MemTable.add(curr_group, sample_name)

    # Unique group names in first-seen order.
    nameset = list(dict.fromkeys(names).keys())

    version = None
    dim = None
    data_header = None

    gene_name_col = 1

    Table1 = ht.ChainedHash(len(nameset), hf.h_rolling)

    group_counts = [[] for i in range(len(groups))]

    for l in gzip.open(data_file_name, 'rt'):
        # First three lines: version, matrix dimensions, column header.
        if version is None:
            version = l
            continue

        if dim is None:
            dim = [int(x) for x in l.rstrip().split()]
            continue

        if data_header is None:
            data_header = []
            i = 0
            for field in l.rstrip().split('\t'):
                data_header.append([field, i])
                i += 1
            # Sorted by field name so binary_search can find a column.
            data_header.sort(key=lambda tup: tup[0])

            continue

        A = l.rstrip().split('\t')

        if A[gene_name_col] == gene_name:
            for group_idx in range(len(groups)):
                ii = 0.0
                jj = 0.0
                for member in members[group_idx]:
                    t00_binary = time.time()
                    member_idx = binary_search(member, data_header)
                    t01_binary = time.time()
                    ii = ii + 1
                    if member_idx != -1:
                        jj = jj + 1
                        t0_hash = time.time()
                        Table1.add(groups[group_idx], int(A[member_idx]))
                        t1_hash = time.time()
            break

    # NOTE(review): t00_binary/t0_hash are bound only if the gene row was
    # found (and, for the hash timings, a member matched) — otherwise the
    # lines below raise NameError. The estimate multiplies the *last*
    # iteration's duration by the iteration count.
    binarytime = t01_binary - t00_binary
    print("Binary Time")
    print(binarytime * ii)
    hashtime = t1_hash - t0_hash
    print("Hash Time")
    print(hashtime * jj)

    # NOTE(review): np.unique sorts the keys; this assumes the number of
    # unique keys does not exceed len(groups).
    group_counts = [[] for i in range(len(groups))]
    i = 0
    for key in np.unique(Table1.keys):
        group_counts[i].append(Table1.search(key))
        i = i + 1

    g = data_viz.boxplot(group_counts, sorted(nameset), group_col_name,
                         gene_name, args.outfile)
Exemplo n.º 20
0
 def test_search_function(self):
     """search() returns the value stored for an existing key."""
     table = ht.ChainedHash(50, hf.h_ascii)
     table.add('text', 'value')
     self.assertEqual('value', table.search('text'))
Exemplo n.º 21
0
 def test_no_overwrite(self):
     """Re-adding a key appends to its bucket instead of overwriting."""
     table = ht.ChainedHash(50, hf.h_ascii)
     table.add('text', 'value')
     table.add('text', 'newvalue')
     bucket = table.T[3]
     self.assertEqual(bucket[0][1], 'value')
     self.assertEqual(bucket[1][1], 'newvalue')
Exemplo n.º 22
0
 def testChainedHash_search_in_table_python(self):
     """search() finds one key among several inserted with h_python."""
     table = hash_tables.ChainedHash(10, hash_functions.h_python)
     for n in range(5):
         table.add(str(n), n * 2)
     self.assertEqual(table.search('3'), 6)
Exemplo n.º 23
0
 def test_search_two_values_one_key(self):
     """Two keys whose ASCII sums are congruent mod 5 ('text'=453,
     'blet'=423) land in the same bucket yet stay individually
     retrievable.
     """
     table = ht.ChainedHash(5, hf.h_ascii)
     table.add('text', 'value')
     table.add('blet', 'newvalue')
     self.assertEqual('value', table.search('text'))
     self.assertEqual('newvalue', table.search('blet'))
Exemplo n.º 24
0
 def testChainedHash_add_to_empty_ascii(self):
     """add() succeeds on a fresh table of random size.

     BUG FIX: the size is now drawn from [1, 100]; the original allowed
     0, and a zero-slot table would presumably fail when the hash is
     reduced modulo the table size.
     """
     x = random.randint(1, 100)
     y = hash_functions.h_ascii
     test = hash_tables.ChainedHash(x, y)
     self.assertTrue(test.add('key', 10))
Exemplo n.º 25
0
 def test_chained_hash_search_1(self):
     # This ChainedHash variant takes (hash_function, capacity) and names
     # its adder insert(); search() returns the stored value.
     table = hash_tables.ChainedHash(hash_functions.h_ascii, 100)
     table.insert('woah!', 1)
     assert(table.search('woah!') == 1)
 def test_chained_hash_bad_fxn(self):
     """Constructing a table with a non-callable hash raises TypeError."""
     for bad_fxn in (None, 'string', int(5), float(420.69)):
         # bind bad_fxn as a default to avoid late-binding surprises
         self.assertRaises(TypeError, lambda f=bad_fxn: ht.ChainedHash(5, f))
Exemplo n.º 27
0
 def test_chained_hash_replace_key(self):
     # insert() on an existing key replaces its value in place: the
     # capacity attribute (which this assertion treats as the number of
     # stored keys) stays 1, and search() returns the newest value.
     table = hash_tables.ChainedHash(hash_functions.h_ascii, 30)
     table.insert('ayo', 10)
     table.insert('ayo', 100)
     assert table.capacity == 1
     assert table.search('ayo') == 100
 def test_chained_hash_add_key_none(self):
     """add() returns None (refuses the insert) when the key is None."""
     table = ht.ChainedHash(5, hf.h_ascii)
     self.assertIsNone(table.add(None, 420))
def main():
    """Plot gene read counts for one gene, grouped by a sample attribute.

    Reads the sample-attribute metadata file and the gzipped gene-read
    matrix named in the parsed CLI args, collects the counts for
    ``args.gene`` per value of ``args.group_type`` using either parallel
    arrays or chained hash tables (``args.data_structure``), and writes a
    boxplot via ``data_viz.boxplot``.  Exits with status 1 on missing
    files or an unknown data-structure option, 0 otherwise.

    Fixes over the original: both input files are opened with context
    managers (the original never closed them), and the header-column
    linear searches are hoisted out of the per-line loops (the original
    re-ran them for every single line).
    """
    args = initialize()

    # Fail fast if either input file is missing.
    if not os.path.exists(args.sample_attributes):
        print('Metadata file not found')
        sys.exit(1)
    if not os.path.exists(args.gene_reads):
        print('Gene data file not found')
        sys.exit(1)

    target_gene_name = args.gene
    metadata_header = None

    if args.data_structure == 'parallel':
        samples, target_group = [], []   # only for parallel array
    elif args.data_structure == 'hash':
        target_group = []
        ht_meta = hash_tables.ChainedHash(100000, hash_functions.h_rolling)
    else:
        print('Please input data structures available.')
        print('Options available include "parallel" and "hash".')
        sys.exit(1)

    sample_idx = target_idx = -1
    with open(args.sample_attributes) as meta_fh:
        for line in meta_fh:
            sample_info = line.rstrip().split('\t')

            # First line is the header; remember it and locate the two
            # columns we need (these depend only on the header, so they
            # are computed once instead of per line).
            if metadata_header is None:
                metadata_header = sample_info
                sample_idx = linear_search('SAMPID', metadata_header)
                target_idx = linear_search(args.group_type, metadata_header)
                continue

            if target_idx == -1:
                break   # no such group

            if args.data_structure == 'parallel':
                samples.append(sample_info[sample_idx])       # ID
                target_group.append(sample_info[target_idx])  # group type
            elif args.data_structure == 'hash':
                key = sample_info[target_idx]                 # group type
                value = sample_info[sample_idx]               # ID
                group_ids = ht_meta.search(key)
                if group_ids is None:
                    ht_meta.add(key, [value])  # map group -> [sample IDs]
                    target_group.append(key)
                else:
                    group_ids.append(value)

    if len(target_group) == 0:
        print('Group type not found')
        sys.exit(1)

    version, dim, rna_header = None, None, None
    description_idx = -1

    with gzip.open(args.gene_reads, 'rt') as rna_fh:
        for line in rna_fh:

            # The first two lines of the matrix file are the version tag
            # and the dimensions; skip past them.
            if version is None:
                version = line
                continue
            if dim is None:
                dim = line
                continue

            if rna_header is None:
                rna_header = line.rstrip().split('\t')
                # Sorted (name, index) pairs kept for the optional
                # binary_search variant (see commented-out call below).
                rna_header_plus_index = []
                for i in range(len(rna_header)):
                    rna_header_plus_index.append([rna_header[i], i])
                rna_header_plus_index.sort()
                # Hoisted: the 'Description' column position depends only
                # on the header row.
                description_idx = linear_search('Description', rna_header)
                continue

            rna_counts = line.rstrip().split('\t')

            if description_idx == -1:
                print('No genes found in the header')
                sys.exit(1)

            # Only the row for the requested gene is of interest.
            if rna_counts[description_idx] != target_gene_name:
                continue

            if args.data_structure == 'parallel':
                attrs = list(set(target_group))
                attrs.sort()
                par_array = []
                for attr in attrs:
                    attr_idxs = linear_search_all_hits(attr, target_group)

                    attr_counts = []
                    for attr_idx in attr_idxs:
                        rna_header_idx = linear_search(samples[attr_idx],
                                                       rna_header)
                        # rna_header_idx = binary_search(
                        #     samples[attr_idx], rna_header_plus_index)
                        if rna_header_idx == -1:
                            continue
                        attr_counts.append(int(rna_counts[rna_header_idx]))
                    par_array.append(attr_counts)
                data_viz.boxplot(par_array, target_group, args.group_type,
                                 'Gene read counts', target_gene_name,
                                 args.output_file)
                sys.exit(0)

            elif args.data_structure == 'hash':
                counts_list = []
                # Map sample ID -> count for this gene's row.
                ht_rna = hash_tables.ChainedHash(
                    100000, hash_functions.h_rolling)
                for i in range(description_idx + 1, len(rna_header)):
                    ht_rna.add(rna_header[i], int(rna_counts[i]))
                target_group.sort()
                for attr in target_group:
                    attr_counts = []
                    samp_ids = ht_meta.search(attr)
                    if samp_ids is None:
                        continue
                    for samp_id in samp_ids:
                        count = ht_rna.search(samp_id)
                        if count is not None:
                            attr_counts.append(count)
                    counts_list.append(attr_counts)
                data_viz.boxplot(counts_list, target_group, args.group_type,
                                 'Gene read counts', target_gene_name,
                                 args.output_file)
                sys.exit(0)
    sys.exit(0)
# Exemplo n.º 30
# 0
def main():
    """Benchmark insert/search timings of three key-value data structures.

    Reads space-separated key/value lines from ``args.dataset`` and, for
    each structure selected by ``args.data_structure`` ('hash', 'AVL',
    'tree', or 'all'), times three operations on the first
    ``args.number_pairs`` pairs: insertion, searching every inserted key,
    and searching the same keys with a '_non' suffix (guaranteed misses).
    Timings are printed; with 'all' selected, a grouped bar chart
    comparing the three structures is also saved and shown.
    """
    args = initialize()

    # Reject out-of-range pair counts up front (2..10000 inclusive).
    if args.number_pairs <= 1 or args.number_pairs > 10000:
        print('The number of key/value pairs should be in the range of 2 to \
            10000.')
        sys.exit(1)

    if not os.path.exists(args.dataset):
        print('Input dataset not found.')
        sys.exit(1)
    else:
        # Whole file is read eagerly so every branch below iterates the
        # same in-memory list.
        f = open(args.dataset, 'r')
        lines = f.readlines()
        f.close()

    t_insert, t_search, t_search_non = [], [], []  # just for plotting

    if args.data_structure == 'hash' or args.data_structure == 'all':
        print('\nResults of the hash table')
        print('=========================')
        # key insertion -- table sized at 10x the pair count to keep
        # chains short.
        table = hash_tables.ChainedHash(10 * int(args.number_pairs),
                                        hash_functions.h_rolling)
        i = 0  # number of pairs taken in / line number
        key_list = []
        start = time.time()
        for l in lines:
            key = l.split(' ')[0]
            value = l.split(' ')[1]
            key_list.append(key)
            if i < args.number_pairs:
                table.add(key, value)
                i += 1
            else:
                break
        end = time.time()
        t_insert.append(end - start)
        print(
            'It requires %8.7f seconds to insert %s keys to the hash table.' %
            ((end - start), args.number_pairs))

        # searching existing keys
        start = time.time()
        for key in key_list:
            table.search(key)
        end = time.time()
        t_search.append(end - start)
        print('It requires %8.7f seconds to search for all the %s keys inerted\
            just now in the hash table.' % ((end - start), args.number_pairs))

        # searching non-existing keys -- the '_non' suffix guarantees a
        # miss for every lookup.
        start = time.time()
        for key in key_list:
            table.search(key + '_non')
        end = time.time()
        t_search_non.append(end - start)
        print('It requires %8.7f seconds to search for %s non-existing keys in\
            the hash table.\n' % ((end - start), args.number_pairs))

    if args.data_structure == 'AVL' or args.data_structure == 'all':
        print('Results of the AVL tree')
        print('=======================')
        # key insertion
        avl_tree = avl.AVLTree()
        i = 0  # number of pairs taken in / line number
        key_list = []
        start = time.time()
        # NOTE(review): unlike the hash and binary-tree branches, this
        # loop has no `else: break`, so key_list collects EVERY line of
        # the dataset and the two search timings below cover more than
        # number_pairs keys.  Confirm whether this is intentional.
        for l in lines:
            key = l.split(' ')[0]
            value = l.split(' ')[1]
            key_list.append(key)
            if i < args.number_pairs:
                avl_tree.insert(key, value)
                i += 1
        end = time.time()
        t_insert.append(end - start)
        print('It requires %8.7f seconds to insert %s keys to the AVL tree.' %
              ((end - start), args.number_pairs))

        # searching existing keys
        start = time.time()
        for key in key_list:
            avl_tree.search(key)
        end = time.time()
        t_search.append(end - start)
        print('It requires %8.7f seconds to search for all the %s keys inerted\
            just now in the AVL tree.' % ((end - start), args.number_pairs))

        # searching non-existing keys
        start = time.time()
        for key in key_list:
            avl_tree.search(key + '_non')
        end = time.time()
        t_search_non.append(end - start)
        print('It requires %8.7f seconds to search for %s non-existing keys in\
            the AVL tree.\n' % ((end - start), args.number_pairs))

    if args.data_structure == 'tree' or args.data_structure == 'all':
        print('Results of the binary tree')
        print('==========================')
        # key insertion -- the first pair becomes the root node; the rest
        # are inserted under it.
        i = 0  # number of pairs taken in / line number
        key_list = []
        start = time.time()
        for l in lines:
            key = l.split(' ')[0]
            value = l.split(' ')[1]
            key_list.append(key)
            if i < args.number_pairs:
                if i == 0:
                    root = bt.Node(key, value)
                    i += 1
                else:
                    bt.insert(root, key, value)
                    i += 1
            else:
                break
        end = time.time()
        t_insert.append(end - start)
        print(
            'It requires %8.7f seconds to insert %s keys to the binary tree.' %
            ((end - start), args.number_pairs))

        # searching existing keys
        start = time.time()
        for key in key_list:
            bt.search(root, key)
        end = time.time()
        t_search.append(end - start)
        print('It requires %8.7f seconds to search for all the %s keys inerted\
            just now in the binary tree.' % ((end - start), args.number_pairs))

        # searching non-existing keys
        start = time.time()
        for key in key_list:
            bt.search(root, key + '_non')
        end = time.time()
        t_search_non.append(end - start)
        print('It requires %8.7f seconds to search for %s non-existing keys in\
            the binary tree.\n' % ((end - start), args.number_pairs))

    # Plot a bar chart if "all" is selected
    if args.data_structure == 'all':
        rc(
            'font', **{
                'family': 'sans-serif',
                'sans-serif': ['DejaVu Sans'],
                'size': 10
            })
        # Set the font used for MathJax - more on this later
        rc('mathtext', **{'default': 'regular'})
        plt.rc('font', family='serif')

        # One group of three bars (insert / hit-search / miss-search)
        # per data structure.
        n_groups = 3  # 3 different data structures
        fig, ax = plt.subplots()
        index = np.arange(n_groups)
        bar_width = 0.25

        data1 = plt.bar(index,
                        t_insert,
                        bar_width,
                        alpha=0.8,
                        label='Insertion')
        data2 = plt.bar(index + bar_width,
                        t_search,
                        bar_width,
                        alpha=0.8,
                        label='Searching\n existing keys')
        data3 = plt.bar(index + 2 * bar_width,
                        t_search_non,
                        bar_width,
                        alpha=0.8,
                        label='Searching\n non-existing keys')

        # Derive a human-readable keyword from the dataset filename
        # ('rand*.x' is expanded to '...random').
        if 'rand' in args.dataset:
            keyword = args.dataset.split('.')[0] + 'om'
        else:
            keyword = args.dataset.split('.')[0]

        plt.title('Manipulation of %s %s key-value pairs' %
                  (args.number_pairs, keyword),
                  weight='semibold')
        plt.xlabel('Data structures', weight='semibold')
        plt.ylabel('Time required (s)', weight='semibold')
        plt.xticks(index + bar_width,
                   ('Hash table', 'AVL tree', 'Binary tree'))
        plt.legend()
        plt.tight_layout()
        plt.grid(True)
        plt.savefig('Benchmark_%s_%s.png' % (keyword, args.number_pairs))
        plt.show()