예제 #1
0
def _part_2(past, args=None, print=print):
    try:
        # _________________________________
        print('\n_____________________________________')
        print('PART 2: FIND UNAM AND AMBS')
        print('_____________________________________\n')
        # _________________________________
        au = None

        if not args.ambs:
            ambs, unam = split_sequence(filename=args.genome)
            au = rb_tree_ambs(ambs, unam)

            print('args and unam successfully split\n')
            past = get_time(past, print=print)

        else:
            print("Ambs unam file exists: ",
                  args.ambs if args.ambs else args.unam)

            a_u_dict = msgunpack_dict(args.ambs if args.ambs else args.unam)

            print('AMBS and UNAM Unpacked\n')
            past = get_time(past, print=print)

        assert au
        return au, past

    except Exception as e:
        raise
예제 #2
0
def main():
    size = 1000000000

    s_array = random.sample(population=range(size), k=size)

    past = time.time()

    inverse1(s_array=s_array)
    print('inverse1 completed.')
    past = get_time(past=past)
    print()

    inverse2(s_array=s_array)
    print('inverse2 completed.')
    past = get_time(past=past)
    print()

    inverse3(s_array)
    print('inverse3 completed')
    past = get_time(past=past)
    print()
예제 #3
0
def _part_0(args=None, print=print):
    try:
        start = time.time()
        # _____________________________________________
        print('\n_____________________________________')
        print('PART 0: READ ARGS AND GENOME/SA FILES')
        print('_____________________________________\n')
        # _____________________________________________

        past = start
        print('reading SA...\n')

        # read suffix array from bytes to ints
        # reading with numpy then converting to 1-D array much slower than array.array
        # however, array cannot read files larger than ~3GB

        s_array = read_byte_numpy(filename=args.SA)
        print('SA read.\n')
        past = get_time(past, print=print)
        print('reading genome...\n')

        # read with Reads instead
        # ! genome has ambs
        #genome = reads(filename=args.genome)
        chrs, genome = chr_splits(filename=args.genome)

        json_it(data=chrs,
                filename=append_file_name(args.outfile + "json_chrs"))

        print('genome read.\n')
        past = get_time(past, print=print)

        # TODO: change below line as necessary
        # args.LCPfile = '../data/lcp_pickle'

        return genome, past, s_array, start

    except Exception as e:
        raise
예제 #4
0
파일: kasai.py 프로젝트: joeskan/uniquekmer
def kasai(genome, s_array, inv_suff=None, verbose=False, print=print) -> tuple:
    """
        returns an unordered dictionary with SA as keys and LCP as values
    :param genome:
    :param s_array:
    :return:


    consumes too much memory using dict for sa_lcp_dict
    """

    past = time.time()

    if verbose:
        print('Truncating sequence to A/C/G/T')

    # (1): truncate genome to only A/C/G/T
    if 'N' in genome:
        print('N found in genome for kasai(). Parsing for unambiguous only')

        splits = genome.split('N')
        genome = ''
        for part in splits:
            genome += part

    size = len(s_array)
    gen_size = len(genome)

    if verbose:
        print('Truncation done.')
        past = get_time(past,print)
        print("SA size: %s", str(size))
        print("genome size: %s", str(gen_size))

    lcp_arr = [0] * size

    if verbose:
        print("Empty List created for LCP")
        print("Creating Empty Lists for Inverse Suffix Array")

    if not inv_suff:
        inv_suff = inverse1(s_array=s_array)
        inv_suff = inv_suff.astype(int)

    inv_suff = inv_suff.tolist()

    if verbose:
        # ____________________________________ #
        print("Inverse Suffix Array Completed")
        past = get_time(past, print)
        # ____________________________________ #

    # length of previous lcp
    k = 0

    s_array = s_array.tolist()

    for i in trange(size, desc="Computing LCP"):

        sa = inv_suff[i]
        if sa == size - 1:
            k = 0
            # sa_lcp_dict[sa] = k
            continue

        if type(sa) != int:
            sa = int(sa)

        j = s_array[sa + 1]

        while i + k < gen_size and j + k < gen_size and genome[i + k] == genome[j + k]:
            k += 1
        # sa_lcp_dict[sa] = k
        lcp_arr[sa] = k

        if k > 0:
            k -= 1

    if verbose:
        print("LCP Array Completed")
        get_time(past)

    # return sa_lcp_dict
    return inv_suff, lcp_arr
예제 #5
0
def _part_3(lcp, au: RedBlackTree, past, inv_suff, args=None, print=print):
    try:
        # ____________________________________
        print('\n_____________________________________')
        print('PART 3: VALIDATE STARTING ADDRESSES')
        print('_____________________________________\n')
        # ____________________________________

        true_addresses = []
        # tops = []
        unique_starts = []

        for tup in true_address_with_sort(lcp=lcp,
                                          au=au,
                                          top=args.length,
                                          bot=args.low,
                                          distance=args.distance,
                                          inv_suff=inv_suff):
            true_addresses.append(tup[0])
            # tops.append(tup[1])
            unique_starts.append(tup[1])

        print('valid addresses calculated\n')
        past = get_time(past, print=print)
        # d_sa = within_distance(in_dict=d_sa, top=args.length, distance=args.distance)
        print('addresses within %s%s', str(args.distance), ' calculated')
        past = get_time(past, print=print)

        filename = append_file_name(filename=args.outfile + 'true_addresses')
        # MSGPACK DOES NOT PRESERVE ORDER
        print('saving true addresses as byte file')
        write_array_to_byte(filename=filename, byte_arr=true_addresses)
        print('saving tops as byte file')
        # filename = append_file_name(filename=args.outfile + 'tops')
        # write_array_to_byte(filename=filename, byte_arr=tops)
        print('saving unique starts as byte file')
        filename = append_file_name(filename=args.outfile + 'unique_starts')
        write_array_to_byte(filename=filename, byte_arr=unique_starts)
        # json_it(d_sa, filename)

        print('default dict msgpack\'ed\n')
        get_time(past, print=print)

        # delete lcp file if final file successfully written
        # don't delete an input file the user has specified
        # if not args.lcpfile:
        #    filename = append_file_name(args.lcpfile + 'json_lcp')
        #    os.remove(filename)

        # write_dictionary(in_dict=d_sa, filename='../default_d_sa_json')

        # print('wrote dictionary to json file\n')

        return

    except InsufficientArguments as e:
        print("Insufficient number of arguments passed!")

    except MemoryError:
        print_memory_usage()
        raise
예제 #6
0
def _part_1(genome, past, s_array, args=None, print=print):
    try:
        # check if args.LCPfile exists
        # if it does, read the pickle file instead of calculating new lcp
        inv_suff = []

        # ___________________________________________
        print('\n_____________________________________')
        print('PART 1: COMPUTE LCP ARRAY')
        print('_____________________________________\n')
        # ____________________________________________

        # if user has specified a LCP pickle file that already exists
        if args.lcpfile and os.path.isfile(path=args.lcpfile):

            print("uniques file exists: ")
            #print(args.lcpfile, '\n')

            # TODO: change this as necessary
            #   hopefully start_uniques will be pickled/jsom/msgpacked in the future
            #lcp = unpickle_dict(filename=args.lcpfile)
            lcp = unjson_it(args.lcpfile)
            # find out what format the lcp was pickled
            if type(lcp) == dict or type(lcp) == OrderedDict:
                key = lcp.keys()[0]
                value = lcp[key]

                if key > value and (100 >= value >= 20):
                    print("old lcp pickle was in format sa:lcp")
                    lcp = deque(lcp.values())

                elif key < value and (100 >= key >= 20):
                    print("old lcp pickle was in format lcp:sa")
                    lcp = deque(lcp.keys())

                else:
                    print("not sure what's going on here for sa_lcp dict")
                    raise KeyboardInterrupt

                s_array = deque(s_array)

            elif type(lcp) == list:
                print('LCP file read as list')

            print("uniques unpacked\n")
            past = get_time(past, print=print)
            print("Computing Unique Start Lengths")

            # combine sa and lcp to form a dict with keys: sa, values: unique_starts
            # TODO: creating OrderedDict consumes too much memory

            filename = append_file_name('json_lcp')
            if args.outfile:
                filename = args.outfile

            if args.inverse:
                inv_suff = unjson_it(args.inverse)
            else:
                inv_suff = inverse1(s_array=s_array)

        else:
            if args.inverse:
                inv_suff = unjson_it(args.inverse)
            inv_suff, lcp = kasai(genome=genome,
                                  inv_suff=inv_suff,
                                  s_array=s_array,
                                  print=print)
            past = time.time()

            # convert suffix array (list) to suffix array (deque) for increased efficiency
            print('Completed.')

            # json it
            filename = append_file_name('json_lcp')
            if args.outfile:
                filename = append_file_name(args.outfile + 'json_lcp')

            print('json\'ing lcp array to %s', filename)
            json_it(data=lcp, filename=filename)

            print('LCP json\'ed!')
            past = get_time(past, print=print)

        return past, lcp, inv_suff

    except Exception as e:
        raise