def crawler(filename: str, geno_file: str):
    # walks the ambs/unam json file to check the validity of ambs/unam groups
    a_u_dict = fm.unjson_it(filename)
    seq = fm.reads(geno_file)
    seq_len = len(seq)  # renamed from `max` to avoid shadowing the builtin
    u_offset = 0
    a_offset = 0
    # important to note that ambs measures length and not position
    for una in a_u_dict:
        # key = unam, value = ambs
        u_offset += int(una)
        a_offset += int(a_u_dict[una]) + int(una) - 1
        if u_offset != 0:
            assert seq[u_offset] in ['A', 'C', 'G', 'T']
            if u_offset + 1 < seq_len:  # guard the +1 lookahead against the end of seq
                assert seq[u_offset + 1] == 'N'
        assert seq[a_offset] == 'N'
        if a_offset + 1 < seq_len:
            assert seq[a_offset + 1] in ['A', 'C', 'G', 'T']
        u_offset += int(a_u_dict[una])
def read_json_file(filename: str):
    # split the json'ed dict into two line-per-entry files: keys -> 'unam', values -> 'ambs'
    a_u_dict = fm.unjson_it(filename)
    with open('ambs', 'w') as ambs, open('unam', 'w') as unam:
        for key in a_u_dict:
            unam.write(str(key) + '\n')
            ambs.write(str(a_u_dict[key]) + '\n')
def read_trues_chr(true_file: str, chrs_file: str):
    print("Reading data: ")
    print("Reading True Address file: ", end="")
    trues = read_byte_to_queue(true_file)
    print("done.\nReading chrs file: ", end='')
    chrs_dict = unjson_it(chrs_file)
    print("done.")
    return trues, chrs_dict
def read_dict(filename: str, filetype='pickle'):
    filetype = filetype.lower()
    if filetype == 'pickle':
        return unpickle_dict(filename)
    elif filetype == 'json':
        return unjson_it(filename)
    elif filetype == 'msgpack':
        return msgunpack_dict(filename=filename)
    else:
        raise Exception('unknown file type')
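# An equivalent dispatch-table variant of read_dict() above, shown only as a
# design sketch: mapping file types to the same reader functions removes the
# if/elif chain and makes adding a new format a one-line change. The reader
# names are the ones read_dict() already calls; nothing new is assumed.
def read_dict_via_table(filename: str, filetype: str = 'pickle'):
    readers = {
        'pickle': unpickle_dict,
        'json': unjson_it,
        'msgpack': lambda fn: msgunpack_dict(filename=fn),
    }
    reader = readers.get(filetype.lower())
    if reader is None:
        raise Exception('unknown file type')
    return reader(filename)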
def read_data(true_file: str, tops_file: str, lcps_file: str, chrs_file: str):
    print("Reading data: ")
    print("Reading True Address file: ", end="")
    trues = read_byte_to_queue(true_file)
    print("done.\nReading Tops file: ", end="")
    tops = read_byte_to_queue(tops_file)
    print("done.\nReading Unique Start file: ", end='')
    lcps = read_byte_to_queue(lcps_file)
    print("done.\nReading chrs file: ", end='')
    chrs_dict = unjson_it(chrs_file)
    print("done.")
    return trues, tops, lcps, chrs_dict
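# read_byte_to_queue() is not defined in this file; the sketch below is a
# hypothetical stand-in that assumes the binary files are flat streams of
# little-endian 4-byte unsigned ints (the record format is an assumption,
# not taken from the repo). It only illustrates the byte-stream -> deque idea.
import struct
from collections import deque

def read_byte_to_queue_sketch(filename: str) -> deque:
    q = deque()
    with open(filename, 'rb') as f:
        while True:
            chunk = f.read(4)
            if len(chunk) < 4:  # stop on EOF or a truncated trailing record
                break
            q.append(struct.unpack('<I', chunk)[0])
    return q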
def test_validity():
    # assumes that mu's have already been json'ed
    mu = fm.unjson_it("../src/c22_mu")
    geno = fm.read_unambiguous("../data/22.fa")
    myd = {}
    for key in tqdm(mu, desc="checking uniqueness"):
        seq = geno[int(key):mu[key] + 1]
        assert seq not in myd  # every minimal unique substring must occur only once
        myd[seq] = 1
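# A toy, self-contained version of the same uniqueness check, assuming the mu
# dict maps a (string) start index to the inclusive end index of a minimal
# unique substring; the toy genome and indices below are made up for
# illustration and are not real data.
def _toy_uniqueness_demo():
    toy_geno = 'ACGTACGGT'
    toy_mu = {'0': 3, '4': 7}       # 'ACGT' (0..3) and 'ACGG' (4..7)
    seen = {}
    for start in toy_mu:
        sub = toy_geno[int(start):toy_mu[start] + 1]
        assert sub not in seen      # a duplicate would mean the substring is not unique
        seen[sub] = 1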
import sys
import os

from tqdm import tqdm

sys.path.append(os.getcwd().split('uniquekmer')[0] + 'uniquekmer/src')
import file_manager as fm

filename = '../22_json_default_dict'
print('reading jsoned dict')
d = fm.unjson_it(filename)
print('read!')
print('reading genome')
genome = fm.reads('../data/22.fa')
print('read!')

ambs = 0
tops = 0
# checking how many tops are less than 100
for sa in tqdm(d, desc='checking dict'):
    top = int(d[sa][1])
    sa = int(sa)
    string = genome[sa:sa + top]
    if 'N' in string:
        ambs += 1
    if top < 100:
        tops += 1

print('ambs:', ambs)
print('tops < 100:', tops)
def temp():
    c22 = unjson_it(filename="c22_mu")
    l0 = list(c22.keys())
    l1 = list(c22.values())
    just_dump(l0, l1, fn="c22_mu_just_dump")
def _part_1(genome, past, s_array, args=None, print=print):
    try:
        # check if args.lcpfile exists;
        # if it does, read that file instead of calculating a new lcp
        inv_suff = []
        print('\n_____________________________________')
        print('PART 1: COMPUTE LCP ARRAY')
        print('_____________________________________\n')
        # if the user has specified an LCP file that already exists
        if args.lcpfile and os.path.isfile(path=args.lcpfile):
            print("uniques file exists: ")
            # print(args.lcpfile, '\n')
            # TODO: change this as necessary
            # hopefully start_uniques will be pickled/json'ed/msgpacked in the future
            # lcp = unpickle_dict(filename=args.lcpfile)
            lcp = unjson_it(args.lcpfile)
            # find out what format the lcp was pickled in
            if type(lcp) == dict or type(lcp) == OrderedDict:
                first_key = next(iter(lcp))  # dict views are not subscriptable in Python 3
                key = int(first_key)         # json keys arrive as strings
                value = int(lcp[first_key])
                if key > value and (100 >= value >= 20):
                    print("old lcp pickle was in format sa:lcp")
                    lcp = deque(lcp.values())
                elif key < value and (100 >= key >= 20):
                    print("old lcp pickle was in format lcp:sa")
                    lcp = deque(lcp.keys())
                else:
                    print("not sure what's going on here for sa_lcp dict")
                    raise KeyboardInterrupt
                s_array = deque(s_array)
            elif type(lcp) == list:
                print('LCP file read as list')
            print("uniques unpacked\n")
            past = get_time(past, print=print)
            print("Computing Unique Start Lengths")
            # combine sa and lcp to form a dict with keys: sa, values: unique_starts
            # TODO: creating OrderedDict consumes too much memory
            filename = append_file_name('json_lcp')
            if args.outfile:
                filename = args.outfile
            if args.inverse:
                inv_suff = unjson_it(args.inverse)
            else:
                inv_suff = inverse1(s_array=s_array)
        else:
            if args.inverse:
                inv_suff = unjson_it(args.inverse)
            inv_suff, lcp = kasai(genome=genome, inv_suff=inv_suff,
                                  s_array=s_array, print=print)
            past = time.time()
            # convert suffix array (list) to suffix array (deque) for increased efficiency
            print('Completed.')
            # json it
            filename = append_file_name('json_lcp')
            if args.outfile:
                filename = append_file_name(args.outfile + 'json_lcp')
            print("json'ing lcp array to %s" % filename)
            json_it(data=lcp, filename=filename)
            print("LCP json'ed!")
            past = get_time(past, print=print)
        return past, lcp, inv_suff
    except Exception:
        raise  # re-raise unchanged; kept as a hook for future error handling
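# kasai() is called above but not defined in this file. The sketch below is
# the textbook Kasai O(n) LCP construction, shown for reference only; the
# repo's kasai() also threads inv_suff and a print function through, which
# this minimal version omits.
def kasai_sketch(text, sa):
    n = len(text)
    rank = [0] * n                  # rank[i]: position of suffix i in sa
    for pos, suf in enumerate(sa):
        rank[suf] = pos
    lcp = [0] * n                   # lcp[pos]: LCP of sa[pos] and sa[pos - 1]
    k = 0
    for i in range(n):
        if rank[i] == 0:
            k = 0                   # first suffix in sorted order has no predecessor
            continue
        j = sa[rank[i] - 1]         # suffix preceding suffix i in sorted order
        while i + k < n and j + k < n and text[i + k] == text[j + k]:
            k += 1
        lcp[rank[i]] = k
        if k:
            k -= 1                  # the next suffix pair shares at least k-1 characters
    return lcp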