def temp_forward_unique_check():
    geno = read_unambiguous(filename=PATH + FORWARD)
    s_arr = read_byte_numpy(append_file_name('data/22.sa'))
    inv_suff, lcp = kasai(geno, s_arr)
    # map each suffix-array position to its LCP value
    myd = {}
    for num in range(len(s_arr)):
        myd[s_arr[num]] = lcp[num]
    trues0 = list(get_uniques(lcp))
    json_it(trues0, "c22_forward_uniques")
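
# Hedged sketch of get_uniques(lcp), which is not defined in this file.
# Judging from the 20-100 "uniquemer" bounds used elsewhere in the pipeline
# (see the format checks in _part_1 and the mu_driver docstring), it
# plausibly yields the suffix ranks whose unique-start length falls inside
# that band. The name, the +1 offset, and the bounds are assumptions, not
# the project's actual implementation.
def get_uniques_sketch(lcp, bot=20, top=100):
    """Yield suffix ranks whose unique-start length lies in [bot, top]."""
    for rank, val in enumerate(lcp):
        if bot <= val + 1 <= top:
            yield rank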
def _part_0(args=None, print=print):
    try:
        start = time.time()
        # _____________________________________________
        print('\n_____________________________________')
        print('PART 0: READ ARGS AND GENOME/SA FILES')
        print('_____________________________________\n')
        # _____________________________________________
        past = start
        print('reading SA...\n')
        # read the suffix array from bytes to ints
        # reading with numpy and converting to a 1-D array is much slower than
        # array.array, but array.array cannot read files larger than ~3GB
        s_array = read_byte_numpy(filename=args.SA)
        print('SA read.\n')
        past = get_time(past, print=print)
        print('reading genome...\n')
        # the genome contains ambiguous bases, so split per chromosome with
        # chr_splits() instead of reading it raw
        # genome = reads(filename=args.genome)
        chrs, genome = chr_splits(filename=args.genome)
        json_it(data=chrs, filename=append_file_name(args.outfile + "json_chrs"))
        print('genome read.\n')
        past = get_time(past, print=print)
        # TODO: change the line below as necessary
        # args.LCPfile = '../data/lcp_pickle'
        return genome, past, s_array, start
    except Exception:
        raise
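
# Hedged sketch of what read_byte_numpy above likely does: read a flat
# binary dump of integers with numpy.fromfile, which avoids the ~3GB file
# limit mentioned in the comment about array.array. The dtype (element
# width and endianness) is an assumption; the real helper lives elsewhere
# in the project.
import numpy as np

def read_byte_numpy_sketch(filename, dtype=np.uint32):
    """Load a suffix array stored as a raw binary dump of integers."""
    return np.fromfile(filename, dtype=dtype)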
def naive_lcp_22():
    s_arr = fm.read_byte_numpy(filename=fm.append_file_name('data/22.sa'))
    lcp = test_kasai.naive_lcp(s_array=s_arr, T=simple_genome())
    fm.json_it(data=lcp, filename=fm.append_file_name('output/naive_lcp_22'))
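
# Hedged sketch of the naive O(n^2) LCP computation that
# test_kasai.naive_lcp presumably implements: for each pair of adjacent
# suffixes in suffix-array order, count matching leading characters
# directly. Useful as a slow reference against which to validate kasai();
# the parameter names mirror the call above, the internals are assumed.
def naive_lcp_sketch(s_array, T):
    """Return lcp where lcp[i] = LCP(T[s_array[i-1]:], T[s_array[i]:])."""
    lcp = [0] * len(s_array)
    for i in range(1, len(s_array)):
        a, b = s_array[i - 1], s_array[i]
        k = 0
        while a + k < len(T) and b + k < len(T) and T[a + k] == T[b + k]:
            k += 1
        lcp[i] = k
    return lcp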
def mu_driver():
    """
    Same flow as driver.py, except it finds minimal uniques instead of
    20-100 uniquemers.
    :return:
    """
    try:
        # gitignore()
        print('reading original genome: ', end='')
        chrs, geno = chr_splits(filename=PATH + ORIGINAL)
        json_it(chrs, append_file_name("json_chrs"))
        del chrs
        print('done.\nreading original SA...: ', end='')
        s_arr = read_byte_numpy(append_file_name('data/genome.sa'))
        lcp1 = kasai(geno, s_arr)[1]
        d1 = OrderedDict(mu(SA=s_arr, LCP=lcp1))
        del lcp1
        del s_arr
        au = _part_2(genome_file_name=PATH + ORIGINAL)
        print("au list: ", list(au))

        # *************************
        # (2) flipped
        # *************************
        print("performing flips: ")
        geno2 = read_unambiguous(PATH + FLIPPED)
        s_arr2 = read_byte_numpy(append_file_name('data/flippedGeno.sa'))
        lcp2 = kasai(geno2, s_arr2)[1]
        del geno2
        mu_result = dict(compare(d=d1, SA=s_arr2, LCP=lcp2))
        del lcp2
        mu_result = OrderedDict(sort_mu(mu_result))
        mu_result = OrderedDict(true_address_dict(mu_result, au))
        json_it(mu_result, append_file_name(files['MU_RESULT']))
        # contigs = list(find_contigs(d=old_mu_result_without_true_addresses, bot=20, top=100))
        contigs = OrderedDict(find_perfect_contigs(d=mu_result, bot=20, top=100))
        json_it(contigs, append_file_name(files['PERFECT_CONTIGS']))
        contigs = list(within_distance(d=contigs, distance=300))
        json_it(contigs, append_file_name(files['PERFECT_CONTIGS_WITH_DISTANCE']))
        print("number of contigs: ", len(contigs))
        print("done")
    except Exception:
        raise
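
# Hedged sketch of the minimal-unique computation that mu(SA=..., LCP=...)
# above appears to perform: a substring starting at text position SA[i] is
# unique once it is longer than its LCP with both neighbouring suffixes in
# sorted order, so the minimal unique length at SA[i] is
# max(LCP[i], LCP[i+1]) + 1. Yielding (position, length) pairs matches the
# OrderedDict(...) construction above; the real mu()'s output format and
# end-of-text boundary handling are assumptions.
def mu_sketch(SA, LCP):
    """Yield (text_position, minimal_unique_length) for each suffix."""
    n = len(SA)
    for i in range(n):
        right = LCP[i + 1] if i + 1 < n else 0
        yield SA[i], max(LCP[i], right) + 1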
def _part_1(genome, past, s_array, args=None, print=print):
    try:
        # if args.lcpfile exists, read it instead of computing a new LCP array
        inv_suff = []
        # _____________________________________________
        print('\n_____________________________________')
        print('PART 1: COMPUTE LCP ARRAY')
        print('_____________________________________\n')
        # _____________________________________________
        # the user specified an LCP file that already exists
        if args.lcpfile and os.path.isfile(args.lcpfile):
            print("LCP file exists:")
            # print(args.lcpfile, '\n')
            # TODO: change this as necessary
            # hopefully start_uniques will be pickled/json'ed/msgpacked in the future
            # lcp = unpickle_dict(filename=args.lcpfile)
            lcp = unjson_it(args.lcpfile)
            # figure out in which format the LCP was pickled
            if isinstance(lcp, dict):
                key = next(iter(lcp))  # dict keys are not subscriptable in Python 3
                value = lcp[key]
                if key > value and (100 >= value >= 20):
                    print("old lcp pickle was in format sa:lcp")
                    lcp = deque(lcp.values())
                elif key < value and (100 >= key >= 20):
                    print("old lcp pickle was in format lcp:sa")
                    lcp = deque(lcp.keys())
                else:
                    print("unrecognized format for sa:lcp dict")
                    raise KeyboardInterrupt
                s_array = deque(s_array)
            elif isinstance(lcp, list):
                print('LCP file read as list')
            print("uniques unpacked\n")
            past = get_time(past, print=print)
            print("Computing Unique Start Lengths")
            # combine sa and lcp into a dict with keys: sa, values: unique_starts
            # TODO: creating an OrderedDict consumes too much memory
            filename = append_file_name('json_lcp')
            if args.outfile:
                filename = args.outfile
            if args.inverse:
                inv_suff = unjson_it(args.inverse)
            else:
                inv_suff = inverse1(s_array=s_array)
        else:
            if args.inverse:
                inv_suff = unjson_it(args.inverse)
            inv_suff, lcp = kasai(genome=genome, inv_suff=inv_suff,
                                  s_array=s_array, print=print)
            past = time.time()
            print('Completed.')
            # json the freshly computed LCP array
            filename = append_file_name('json_lcp')
            if args.outfile:
                filename = append_file_name(args.outfile + 'json_lcp')
            print("json'ing lcp array to %s" % filename)
            json_it(data=lcp, filename=filename)
            print("LCP json'ed!")
        past = get_time(past, print=print)
        return past, lcp, inv_suff
    except Exception:
        raise
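
# Hedged sketch of Kasai's O(n) LCP construction, which the kasai() call
# above wraps: walk the text in position order, reusing the previous LCP
# value minus one as a lower bound for the next comparison. The signature
# is simplified relative to the project's kasai() (no precomputed
# inv_suff, no injected print); this is a standard reference
# implementation, not the project's code.
def kasai_sketch(text, s_array):
    """Return (inv_suff, lcp) where lcp[r] = LCP of SA[r] with SA[r-1]."""
    n = len(s_array)
    inv_suff = [0] * n
    for rank, pos in enumerate(s_array):
        inv_suff[pos] = rank
    lcp = [0] * n
    k = 0
    for pos in range(n):
        rank = inv_suff[pos]
        if rank == 0:
            k = 0
            continue
        j = s_array[rank - 1]  # suffix preceding ours in sorted order
        while pos + k < n and j + k < n and text[pos + k] == text[j + k]:
            k += 1
        lcp[rank] = k
        if k:
            k -= 1  # moving to the next position drops the LCP by at most 1
    return inv_suff, lcp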