# Imports assumed by the functions in this excerpt; the original file headers
# are not shown. Util, Logic, and LogicPrep are project-local modules, and
# module-level constants (WORK_DIR, IN, FASTQ, INIT, BRCD_FL, SYSTEM_NM,
# TOTAL_CPU, MULTI_CNT, etc.) are assumed to be defined alongside them.
import multiprocessing as mp
import os
import random
import time

import numpy as np
from Bio import SeqIO

import Logic
import LogicPrep
import Util


def multi_process():
    util = Util.Utils()
    logic_prep = LogicPrep.LogicPreps()

    # reuse the instances created above instead of constructing new ones
    brcd_df = util.read_excel_to_df(WORK_DIR + IN + BRCD_FL)
    # key : TTTTTT+barcode, val : Target length
    brcd_dict = logic_prep.make_df_to_dict(brcd_df, 0, 1)
    logic = Logic.Logics(INIT, brcd_dict)

    sources = util.get_files_from_dir(WORK_DIR + FASTQ + "*.fastq")
    for path in sources:
        fastq_list = util.make_fastq_file_to_list(path)

        # divide data_list by MULTI_CNT
        splited_fastq_list = np.array_split(fastq_list, MULTI_CNT)
        print("platform.system() : ", SYSTEM_NM)
        print("total cpu_count : ", str(TOTAL_CPU))
        print("will use : ", str(MULTI_CNT))
        pool = mp.Pool(processes=MULTI_CNT)

        pool_list = pool.map(logic.get_brcd_umi_frequency_from_FASTQ, splited_fastq_list)
        result_dict, brcd_result_dict = logic_prep.merge_dict_pool_list(pool_list)
        res_list = logic_prep.make_dict_to_list(result_dict, brcd_result_dict)
        sorted_res_list = logic_prep.sort_list_by_ele(res_list, 0)

        header = ["barcode", "#tot_freq_barcode", "umi", "#freq_umi"]
        util.make_tsv(path.replace("FASTQ", "output").replace(".fastq", "_result.txt"), header, sorted_res_list)

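# The pool-based functions in this excerpt should only be invoked under a
# `if __name__ == '__main__':` guard: on spawn-based platforms (Windows, recent
# macOS) multiprocessing re-imports the module in every worker, and an
# unguarded call would recurse. A minimal sketch of a guarded entry point
# (illustrative; the original scripts' entry points are not shown here):
#
#     if __name__ == '__main__':
#         multi_process()
#
# Wrapping pool use in `with mp.Pool(processes=MULTI_CNT) as pool:` would also
# guarantee cleanup; several functions below create pools they never close.
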
def make_filtered_ccds_current_file_by_shortest_cdn():
    print('make_filtered_ccds_current_file_by_shortest_cdn')
    logic_prep = LogicPrep.LogicPreps()
    util = Util.Utils()

    ccds_list = []
    if SYSTEM_NM == 'Linux':
        ccds_list.extend(util.read_tsv_ignore_N_line(WORK_DIR + IN + NON_FLT_CDS_INFO, n_line=0))
    else:
        # ccds_list.extend(util.read_csv_ignore_N_line(WORK_DIR + IN + NON_FLT_CDS_INFO, n_line=0)[:3000])
        ccds_list.extend(util.read_tsv_ignore_N_line(WORK_DIR + IN + NON_FLT_CDS_INFO, n_line=0))

    # st plan A : filter out non Public, non Identical
    ccds_list = logic_prep.get_data_with_trgt_strng(ccds_list, 'Public', 5)
    ccds_list = logic_prep.get_data_with_trgt_strng(ccds_list, 'Identical', -1)

    ccds_hg38_form_list = logic_prep.transform_mouse_ccds_form_to_hg38_refFlat(ccds_list)
    filted_ccds_list = logic_prep.get_shortest_cdn_among_same_gen_id(ccds_hg38_form_list)  # 20201201
    # en plan A

    header = ['GeneSym', 'NMID', 'Chrom', 'Strand', 'Transcript_Start', 'End', 'ORFStart', 'End', '#Exon',
              'ExonS_list', 'ExonE_list']
    util.make_tsv(WORK_DIR + IN + 'shortest_cdn_' + FLTD_CDS_INFO, header, filted_ccds_list)

def sort_n_merge_by_chr_one_file(self, init_merge, init_be):
    ref_path = init_merge[0]
    cdf_file = init_merge[1]
    a_or_c_idx = init_merge[2]
    a_c_rule = init_merge[3]
    work_dir = init_merge[4]
    top_n = init_merge[5]
    f_nm = init_be[5]

    logic_prep = LogicPrep.LogicPreps()
    util = Util.Utils()

    trgt_seq_dict = logic_prep.get_target_seq_with_clvg_site(ref_path + cdf_file, init_be)
    chr_dict = logic_prep.target_seq_with_clvg_site_group_by_chromosome(trgt_seq_dict)

    cs9_score_dict = {}
    cs9_score_dict.update(logic_prep.get_deep_cas9_tupl(work_dir + "deep_cas_9/", "RANK_final_DeepCas9_0.txt", "sample_0.txt"))
    cs9_score_dict.update(logic_prep.get_deep_cas9_tupl(work_dir + "deep_cas_9/", "RANK_final_DeepCas9_1.txt", "sample_1.txt"))

    top_n_list = []
    for chr_key, trnscrpt_list in chr_dict.items():
        result_list = []
        result_list = logic_prep.merge_cas9_abe_cbe_to_list(chr_key, [trnscrpt_list, {}, {}, cs9_score_dict], result_list)
        sort_by_cas9_list = logic_prep.sort_by_idx_element(result_list, -3, [])
        top_n_list.extend(sort_by_cas9_list[:top_n + 1])

    # make tsv file result
    util.make_tsv_after_sorting(work_dir + "output/" + f_nm + "_seq_sorted_by_CAS9_top_" + str(top_n), top_n_list, init_be)
    # make excel result
    util.make_excel_after_sorting(work_dir + "output/" + f_nm + "_seq_sorted_by_CAS9_top_" + str(top_n), top_n_list, init_be)

def main_by_list_w_filenames():
    util = Util.Utils()
    logic_prep = LogicPrep.LogicPreps()

    # file_num_list = []
    # for j in range(964):
    #     file_num_list.append(j)

    header = ['chr', 'tot_seq', 'fam_nm', 'index', 'strand', 'trns_flag']
    # for i in file_num_list:
    for i in range(964):
        path = WORK_DIR + IN + TE_info_fl.replace(".txt", "") + "/Genome_TandemRepeat_TRD_" + str(i) + ".txt"
        te_inf_list = util.read_csv_ignore_N_line(path, "\t", 0)

        splited_te_inf_list = np.array_split(te_inf_list, MULTI_CNT)
        print("platform.system() : ", SYSTEM_NM)
        print("total cpu_count : ", str(TOTAL_CPU))
        print("will use : ", str(MULTI_CNT))
        pool = mp.Pool(processes=MULTI_CNT)

        pool_list = pool.map(start_multi_processing, splited_te_inf_list)
        pool.close()
        splited_te_inf_list[:] = []

        result_list = logic_prep.merge_multi_list(pool_list)
        pool_list.clear()

        util.make_csv(WORK_DIR + "output2/TE_trgt_20210330_" + str(i) + ".txt", header, result_list, 0, '\t')
        result_list.clear()

def merge_cas9_abe_cbe():
    logic = Logic.Logics()
    logic_prep = LogicPrep.LogicPreps()
    util = Util.Utils()

    trgt_seq_dict = logic_prep.get_target_seq_with_clvg_site(REF_PATH + CDS_FILE, INIT_BE)
    chr_dict, aqia_chr_dict = logic_prep.target_seq_with_clvg_site_group_by_chromosome(
        trgt_seq_dict, ":Macaca_fascicularis_5.0:", IGNORE_CHR_LIST)

    a_c_dict = logic.filter_out_by_ACGTU_rule(chr_dict, A_or_C_IDX, ACTG_RULE)
    aqia_a_c_dict = logic.filter_out_by_ACGTU_rule(aqia_chr_dict, A_or_C_IDX, ACTG_RULE)

    abe_score_dict = logic_prep.get_deep_base_ed_score(WORK_DIR + "deep_ABE/ABE_Efficiency.txt")
    cbe_score_dict = logic_prep.get_deep_base_ed_score(WORK_DIR + "deep_CBE/CBE_Efficiency.txt")
    cs9_score_dict = logic_prep.get_deep_cas9_tupl(WORK_DIR + "deep_cas_9/", "RANK_final_DeepCas9_Final.txt", "sample.txt")

    util.make_merge_excel_by_chr(
        WORK_DIR + "merge_cas9_abe_cbe/crab_eating_monkey_merge_abe_cbe_cas9",
        [a_c_dict, abe_score_dict, cbe_score_dict, cs9_score_dict], INIT_BE)
    util.make_merge_excel(
        WORK_DIR + "merge_cas9_abe_cbe/crab_eating_monkey_merge_abe_cbe_cas9_AQIA",
        [aqia_a_c_dict, abe_score_dict, cbe_score_dict, cs9_score_dict], INIT_BE)

def multi_processing_1():
    logic_prep = LogicPrep.LogicPreps()
    util = Util.Utils()

    # CHROM  POS  ID  REF  ALT  mut_length  CLNVC  CLNSIG
    # POS - 1 = index of .fa sequence
    # [['1', '930188', '846933', 'G', 'A', '1', 'substitution', 'Uncertain_significance'], ...]
    mut_list = []
    if SYSTEM_NM == 'Linux':
        mut_list.extend(util.read_csv_ignore_N_line(WORK_DIR + "input/" + MUT_INFO, "\t"))
    else:
        mut_list.extend(util.read_csv_ignore_N_line(WORK_DIR + "input/" + MUT_INFO, "\t")[:300])

    splited_mut_list = np.array_split(mut_list, MULTI_CNT)
    print("platform.system() : ", SYSTEM_NM)
    print("total cpu_count : ", str(TOTAL_CPU))
    print("will use : ", str(MULTI_CNT))
    pool = mp.Pool(processes=MULTI_CNT)

    pool_list = pool.map(get_PAM_within_N_bp_of_POS, splited_mut_list)
    result_list = logic_prep.merge_multi_list(pool_list)

    header = ['CHROM', 'PAM', str(SEQ_WIN_SIZE[0]) + ' + PAM + ' + str(SEQ_WIN_SIZE[1]), 'PAM_POS', 'STRAND']
    try:
        os.remove(WORK_DIR + "input/" + multi_processing_1_FILE)
    except Exception as err:
        print('os.remove(WORK_DIR + "input/" + multi_processing_1_FILE) : ', str(err))
    util.make_csv(WORK_DIR + "input/" + multi_processing_1_FILE, header, result_list, 0, "\t")
    util.make_excel(WORK_DIR + "output/ClinVar_hg38_result", header, result_list)

def main_20201117():
    util = Util.Utils()
    logic_prep = LogicPrep.LogicPreps()
    logic = Logic.Logics()

    input_list = util.read_tsv_ignore_N_line(INPUT_LIST)

    result_list = []
    for val_arr in input_list:
        ori_seq = val_arr[0].upper()
        n_of_mismatch = int(val_arr[1])
        n_of_sub_seq = int(val_arr[2])

        idx_set = logic_prep.make_seq_idx_set(0, len(ori_seq))
        rand_idx_list = []
        for i in range(n_of_sub_seq):
            # random.sample no longer accepts a set in Python >= 3.11,
            # so turn it into a sorted sequence first
            rand_idx_list.append([random.sample(sorted(idx_set), n_of_mismatch)])

        for idx_list in rand_idx_list:
            sub_seq = ori_seq
            for idx_arr in idx_list:
                # substitute each sampled position with a different base
                for i in idx_arr:
                    tmp_set = BASE_NT - {ori_seq[i].lower()}
                    sub_seq = logic.swap_char_in_string(sub_seq, i, random.sample(sorted(tmp_set), 1)[0])
                result_list.append([ori_seq, sub_seq, len(idx_arr)])

    header = ['ori_seq', 'sub_seq', '#_of_mismatch']
    try:
        util.make_excel(WORK_DIR + '/output/result', header, result_list)
    except Exception as err:
        # fall back to tsv output if excel writing fails
        print('make_excel failed, writing tsv instead : ', str(err))
        util.make_tsv(WORK_DIR + '/output/result', header, result_list)

def main():
    util = Util.Utils()
    logic_prep = LogicPrep.LogicPreps(WEB_DRV, TARGET_URL)
    logic = Logic.Logics()

    for input_file in INPUT_TXT:
        needle_result_list = []
        input_list = util.read_tb_txt(WORK_DIR + input_file)
        for val_arr in input_list:
            final_idx = val_arr[1]
            asequence = val_arr[3]  # NGS read
            bsequence = val_arr[4]  # Reference

            logic_prep.go_to_url(TARGET_URL)
            logic_prep.input_data_by_id("pn_stype", "dna")
            logic_prep.input_data_by_id("asequence", asequence)
            logic_prep.input_data_by_id("bsequence", bsequence)
            logic_prep.scroll_down()
            logic_prep.get_by_xpath("//div[@id='jd_submitButtonPanel']/input[@type='submit']", False).click()

            logic_prep.go_to_url(WEB_DRV.current_url)
            logic_prep.get_by_xpath("//pre[@id='alignmentContent']", False)
            crwl_txt = logic_prep.get_by_xpath("//pre[@id='alignmentContent']", False).text

            a_seq_name, crwl_txt_arr = logic.extract_data(crwl_txt)
            logic.add_needle_result(final_idx, a_seq_name, crwl_txt_arr, needle_result_list)

        util.make_excel(WORK_DIR + "crawler_output/result_" + input_file.replace(".txt", ""), needle_result_list)

def multi_processing_for_whole_pam_ClinVar():
    logic_prep = LogicPrep.LogicPreps()
    util = Util.Utils()

    # CHROM  POS  ID  REF  ALT  QUAL  FILTER  INFO
    # POS - 1 = index of .fa sequence
    # [['1', '1338000', '208047', 'CT', 'C', '.', '.', '"ALLELEID=204306;CLNDISDB=MONDO:MONDO:0014591,MedGen:C4225363,OMIM:616331;CLNDN=Robinow_syndrome,_autosomal_dominant_2;CLNHGVS=NC_000001.11:g.1338001del;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Pathogenic;CLNVC=Deletion;CLNVCSO=SO:0000159;GENEINFO=DVL1:1855;MC=SO:0001589|frameshift_variant;ORIGIN=33;RS=797044837"'], ...]
    mut_list = []
    if SYSTEM_NM == 'Linux':
        mut_list.extend(util.read_csv_ignore_N_line(WORK_DIR + "input/" + MUT_INFO, "\t"))
    else:
        mut_list.extend(util.read_csv_ignore_N_line(WORK_DIR + "input/" + MUT_INFO, "\t")[:300])

    splited_mut_list = np.array_split(mut_list, MULTI_CNT)
    print("platform.system() : ", SYSTEM_NM)
    print("total cpu_count : ", str(TOTAL_CPU))
    print("will use : ", str(MULTI_CNT))
    pool = mp.Pool(processes=MULTI_CNT)

    pool_list = pool.map(get_seq_by_pam_after_mut, splited_mut_list)
    result_list = logic_prep.merge_multi_list(pool_list)

    header = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO',
              'P_REF_SEQ_[' + str(WIN_SIZE[0]) + '], M_REF_SEQ_[' + str(WIN_SIZE[1]) + ']']
    for pam_nm in OTHOLOG:
        for strand in ['+', '-']:
            header.append(pam_nm + strand)

    util.make_excel(WORK_DIR + "output/SY_Dominant_result_by_spacer", header, result_list)

def multi_processing_test():
    util = Util.Utils()
    ref_val = ['76967',
               'TTTGACTCATCTCGTCACTACAGACATGCATCGCATACTCTCCCTATGTTCCAGCTTCCTGGGTCTGCAGGTCCAGCCGAGTCGCCAAATAAGTGCCATCTACTCTACC']
    logic_prep = LogicPrep.LogicPreps([ref_val, 0, 0])

    ngs_read = util.read_tb_txt_wo_header(WORK_DIR + NGS_read_DIR + ref_val[0] + ".txt")

    splited_ngs_read = np.array_split(ngs_read, MULTI_CNT)
    print("total cpu_count : " + str(TOTAL_CPU))
    print("will use : " + str(MULTI_CNT))
    pool = mp.Pool(processes=MULTI_CNT)

    pool_list = pool.map(logic_prep.get_pairwise2_needle_dict_simple, splited_ngs_read)
    merge_dict, _ = logic_prep.merge_multi_dict_from_simple(pool_list)
    result_dict = logic_prep.get_sub_ins_del_list_dict_from_simple(merge_dict)

    util.make_excel_simple(WORK_DIR + "output/multi_p_result_" + ref_val[0] + "_" + str(time.perf_counter()), result_dict)

def multi_processing():
    util = Util.Utils()
    ref_seq_list = util.read_tb_txt_wo_header(WORK_DIR + REF_SEQ)
    for ref_val in ref_seq_list:
        logic_prep = LogicPrep.LogicPreps([ref_val, 0, 0])
        try:
            ngs_read = util.read_tb_txt_wo_header(WORK_DIR + NGS_read_DIR + ref_val[0] + ".txt")

            splited_ngs_read = np.array_split(ngs_read, MULTI_CNT)
            print("total cpu_count : " + str(TOTAL_CPU))
            print("will use : " + str(MULTI_CNT))
            pool = mp.Pool(processes=MULTI_CNT)

            pool_list = pool.map(logic_prep.get_pairwise2_needle_dict_simple, splited_ngs_read)
            merge_dict, _ = logic_prep.merge_multi_dict_from_simple(pool_list)
            result_dict = logic_prep.get_sub_ins_del_list_dict_from_simple(merge_dict)

            util.make_excel_simple(WORK_DIR + "output/multi_p_result_" + ref_val[0], result_dict)
        except FileNotFoundError:
            print(ref_val[0] + ".txt : FileNotFoundError")
            continue

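# get_pairwise2_needle_dict_simple appears to wrap Biopython's pairwise2, which
# is deprecated in favor of Bio.Align.PairwiseAligner. A minimal sketch of an
# equivalent global (needle-style) alignment with the newer API; the function
# name and the EMBOSS-needle-like scoring below are illustrative assumptions,
# not the project's actual helper:
def needle_align_sketch(ref_seq, read_seq):
    from Bio import Align

    aligner = Align.PairwiseAligner()
    aligner.mode = "global"           # Needleman-Wunsch-style global alignment
    aligner.match_score = 5           # EMBOSS needle defaults: match +5,
    aligner.mismatch_score = -4       # mismatch -4,
    aligner.open_gap_score = -10      # gap open -10,
    aligner.extend_gap_score = -0.5   # gap extend -0.5
    return aligner.align(ref_seq, read_seq)[0]  # best-scoring alignment
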
def anlyze_indel_by_MAIN_to_SUB():
    util = Util.Utils()
    logic_prep = LogicPrep.LogicPreps()
    logic = Logic.Logics()

    brcd_list = util.csv_to_list_ignr_header(WORK_DIR + INPUT + BRCD_FILE)
    brcd_arr = logic_prep.make_arr_list_to_list(brcd_list)

    trgt_list = []
    trgt_err_list = []
    for path in [MAIN_DIR, SUB_DIR]:
        csv_list = util.csv_to_list_ignr_header(WORK_DIR + INPUT + path + F_TABLE_FILE, "\t")
        result_list, err_list = logic_prep.get_data_by_cell_id(csv_list, brcd_arr, CONST_INIT)
        trgt_list.append(result_list)
        trgt_err_list.append(err_list)

    # result_dict = logic.count_len_arr_mut_non_mut_by_main_list(trgt_list[0], trgt_list[1], brcd_arr)
    result_dict = logic.count_cell_mut_non_mut_by_main_list(trgt_list[0], trgt_list[1])
    util.make_excel_indel_frequency_by_cell_id(
        WORK_DIR + "output/result_indel_" + MAIN_SUB_NAME[0] + "_" + MAIN_SUB_NAME[1], result_dict, MAIN_SUB_NAME)

    for idx in range(len(trgt_err_list)):
        sorted_err_list = logic_prep.sort_list_by_ele(trgt_err_list[idx], -1)
        logic.count_num_by_err(sorted_err_list)
        util.make_excel_err_list(WORK_DIR + "output/" + MAIN_SUB_NAME[idx] + "_error_list", sorted_err_list)

def main1():
    util = Util.Utils()
    logic_prep = LogicPrep.LogicPreps()
    logic = Logic.Logics()

    seq_record = util.get_seq_record_from_genbank(WORK_DIR + NCBI + genbank_file_name + ".gb")
    cds_idx_list = logic_prep.get_cds_idx_arr_to_list(seq_record)

    init_rule = INIT
    pam_seq = init_rule[2]

    plus_strand_list, minus_strand_list = logic.get_idx_of_matching_seq(seq_record.seq, pam_seq)
    plus_idx_list = logic.get_idx_in_list(plus_strand_list, cds_idx_list)
    minus_idx_list = logic.get_idx_in_list(minus_strand_list, cds_idx_list, False)

    filtered_plus_idx_list = logic_prep.filter_out_dupl(plus_idx_list)
    filtered_minus_idx_list = logic_prep.filter_out_dupl(minus_idx_list)

    plus_seq_list = logic.get_trgt_seq_in_idx_list(seq_record.seq, filtered_plus_idx_list, init_rule)
    minus_seq_list = logic.get_trgt_seq_in_idx_list(seq_record.seq, filtered_minus_idx_list, init_rule, False)

    merge_list = logic_prep.merge_list([plus_seq_list, minus_seq_list])
    tot_list = logic_prep.sort_list_by_ele(merge_list, 0)

    header = ["sequence", "strand"]

def multi_step_1():
    util = Util.Utils()
    logic_prep = LogicPrep.LogicPreps()

    fl_num = 0
    dfam_info = util.read_csv_ignore_N_line(DFAM_ANNO + str(fl_num), '\t', 0)
    if SYSTEM_NM != 'Linux':
        dfam_info = dfam_info[:100]
    dfam_dict = logic_prep.make_list_to_dict_by_ele_as_key(dfam_info, 0)

    header = ['chr', 'tot_seq', 'fam_nm', 'index', 'strand', 'trns_flag']
    for key, val_list in dfam_dict.items():
        splited_dfam_list = np.array_split(val_list, MULTI_CNT)
        print("platform.system() : ", SYSTEM_NM)
        print("total cpu_count : ", str(TOTAL_CPU))
        print("will use : ", str(MULTI_CNT))
        pool = mp.Pool(processes=MULTI_CNT)

        pool_list = pool.map(get_trgt, splited_dfam_list)
        result_list = logic_prep.merge_multi_list(pool_list)
        print(type(result_list))
        util.make_excel(WORK_DIR + "output/TE_trgt_" + str(fl_num) + "_" + key, header, result_list)

def make_filtered_hg38_refFlat():
    logic_prep = LogicPrep.LogicPreps()
    util = Util.Utils()

    # GeneSym  NMID  Chrom  Strand  Transcript_Start  End  ORFStart  End  #Exon  ExonS_list  ExonE_list
    # ['MIR6859-1', 'NR_106918', 'chr1', '-', '17368', '17436', '17436', '17436', '1', '17368,', '17436,']
    # ['WASH7P', 'NR_024540', 'chr1', '-', '14361', '29370', '29370', '29370', '11', '14361,14969,15795,16606,16857,17232,17605,17914,18267,24737,29320,', '14829,15038,15947,16765,17055,17368,17742,18061,18366,24891,29370,']
    cds_list = []
    if SYSTEM_NM == 'Linux':
        cds_list.extend(util.read_csv_ignore_N_line(WORK_DIR + "input/" + CDS_INFO, "\t", 0))
    else:
        cds_list.extend(util.read_csv_ignore_N_line(WORK_DIR + "input/" + CDS_INFO, "\t", 0)[:3000])

    NM_cds_list = logic_prep.filter_out_NON_NM_id_in_cds_list(cds_list)
    # filter_out_cds_wout_strt_cdn(NM_cds_list)

    splited_cds_list = np.array_split(NM_cds_list, MULTI_CNT)
    print("platform.system() : ", SYSTEM_NM)
    print("total cpu_count : ", str(TOTAL_CPU))
    print("will use : ", str(MULTI_CNT))
    pool = mp.Pool(processes=MULTI_CNT)

    pool_cds_idx_list = pool.map(filter_out_cds_wout_strt_cdn, splited_cds_list)
    result_list = logic_prep.merge_multi_list(pool_cds_idx_list)

    header = ['GeneSym', 'NMID', 'Chrom', 'Strand', 'Transcript_Start', 'End', 'ORFStart', 'End', '#Exon',
              'ExonS_list', 'ExonE_list']
    try:
        os.remove(WORK_DIR + "input/" + FILTERED_CDS_INFO)
    except Exception as err:
        print('os.remove(WORK_DIR + "input/" + FILTERED_CDS_INFO) : ', str(err))
    util.make_csv(WORK_DIR + "input/" + FILTERED_CDS_INFO, header, result_list, 0, "\t")
    util.make_excel(WORK_DIR + "output/filtered_hg38_refFlat", header, result_list)

def recount_motif_error():
    util = Util.Utils()
    logic = Logic.Logics()
    logic_prep = LogicPrep.LogicPreps()

    for err_fl_path in MOTIF_ERROR_FL:
        motif_err_fl = util.read_tsv_ignore_N_line(WORK_DIR + IN + err_fl_path)

        # filter out missing values
        flted_1_motif_err_fl = logic_prep.filterout_ele_w_trgt_str(motif_err_fl, 2, '-')
        motif_err_fl.clear()
        # #NAME? is removed
        flted_2_motif_err_fl = logic_prep.filterout_ele_w_trgt_str(flted_1_motif_err_fl, 2, 'N')
        flted_1_motif_err_fl.clear()
        flted_3_motif_err_fl = logic_prep.filterout_ele_w_trgt_str(flted_2_motif_err_fl, 2, 'n')
        flted_2_motif_err_fl.clear()

        motif_err_dict = logic_prep.make_list_to_dict_by_elekey(flted_3_motif_err_fl, 0)
        result_list = logic.recount_total_proportion_by_dictkey(motif_err_dict, 3)

        # head = ['Filename', 'INDEX', 'seq', 'Motif', 'Count', 'Total_cnt', 'Proportion', 'Substitution']
        head = ['Filename', 'seq', 'Motif', 'Count', 'Total_cnt', 'Proportion', 'Substitution']
        util.make_excel(WORK_DIR + OU + 'new_' + err_fl_path.replace('.txt', ''), head, result_list)

def test():
    util = Util.Utils()
    logic_prep = LogicPrep.LogicPreps()

    excel_arr = []
    csv_list = [[x.upper() for x in tmp_arr]
                for tmp_arr in util.read_csv_ignore_N_line(WORK_DIR + INPUT + GUIDE_BARCODE_CSV)]
    # csv_list
    excel_arr.append(csv_list)
    # index_list
    excel_arr.append(logic_prep.make_1_arr_list_to_list(0, csv_list))
    # guide_list
    excel_arr.append(logic_prep.make_1_arr_list_to_list(2, csv_list))
    # barcd_randBP_list
    excel_arr.append(logic_prep.make_2_arr_list_to_list_after_slice(6, 7, POS_SLICE_RAND_BP, csv_list))

    # debug print: compare unsliced vs sliced barcode + randBP pairs
    merged_list = logic_prep.make_2_arr_list_to_list(6, 7, csv_list)
    sliced_list = logic_prep.make_2_arr_list_to_list_after_slice(6, 7, POS_SLICE_RAND_BP, csv_list)
    for tmp_idx in range(len(merged_list)):
        print(merged_list[tmp_idx])
        print(sliced_list[tmp_idx])

    # trgt_list
    excel_arr.append(logic_prep.make_1_arr_list_to_list(8, csv_list))
    # d0_seq_wo_scaf_list
    excel_arr.append(logic_prep.make_3_arr_list_to_list(3, 4, 5, csv_list))
    # barcd_list
    excel_arr.append(logic_prep.make_1_arr_list_to_list(6, csv_list))
    # randBP_list
    excel_arr.append(logic_prep.make_1_arr_list_to_list(7, csv_list))

    for d0_d4_idx in range(len(D0_D4_FLAG_ARR)):
        logic = Logic.Logics(INIT, excel_arr, D0_D4_FLAG_ARR[d0_d4_idx])

def make_filtered_ccds_current_file_by_shortest_cdn():
    logic_prep = LogicPrep.LogicPreps()
    util = Util.Utils()

    ccds_list = []
    if SYSTEM_NM == 'Linux':
        ccds_list.extend(util.read_csv_ignore_N_line(WORK_DIR + "input/201130_CCDS_" + TYPE + "_current.txt", "\t", 0))
    else:
        # ccds_list.extend(util.read_csv_ignore_N_line(WORK_DIR + "input/CCDS.current.txt", "\t", 0)[:3000])
        ccds_list.extend(util.read_csv_ignore_N_line(WORK_DIR + "input/201130_CCDS_" + TYPE + "_current.txt", "\t", 0))

    # st plan A : filter out non Public, non Identical
    ccds_list = logic_prep.get_data_with_trgt_strng(ccds_list, 'Public', 5)
    ccds_list = logic_prep.get_data_with_trgt_strng(ccds_list, 'Identical', -1)

    ccds_hg38_form_list = logic_prep.transform_mouse_ccds_form_to_hg38_refFlat(ccds_list)
    filted_ccds_list = logic_prep.get_shortest_cdn_among_same_gen_id(ccds_hg38_form_list)  # 20201201
    # en plan A

    header = ['GeneSym', 'NMID', 'Chrom', 'Strand', 'Transcript_Start', 'End', 'ORFStart', 'End', '#Exon',
              'ExonS_list', 'ExonE_list']
    try:
        os.remove(WORK_DIR + "input/filtered_shortest_cdn_CCDS_" + TYPE + ".txt")
    except Exception as err:
        print('os.remove(WORK_DIR + "input/filtered_CCDS.current.txt") : ', str(err))
    util.make_csv(WORK_DIR + "input/filtered_shortest_cdn_CCDS_" + TYPE + ".txt", header, filted_ccds_list, 0, "\t")

def make_filtered_out_ClinVar_pos_in_cds_or_not():
    logic = Logic.Logics()
    logic_prep = LogicPrep.LogicPreps()
    util = Util.Utils()

    cds_info = util.read_csv_ignore_N_line(WORK_DIR + "input/" + ALL_CDS_INFO, "\t")
    cds_dict_by_chr = {}
    for cds_arr in cds_info:
        chrom = cds_arr[2]
        start_idx_arr, end_idx_arr = logic_prep.get_orf_strt_end_idx_arr(cds_arr)
        idx_list = logic_prep.get_idx_num_frm_strt_to_end_list(start_idx_arr, end_idx_arr)
        if chrom in cds_dict_by_chr:
            cds_dict_by_chr[chrom].append(idx_list)
        else:
            cds_dict_by_chr.update({chrom: [idx_list]})

    mut_dict = logic_prep.get_dict_from_list_by_ele_key(
        util.read_csv_ignore_N_line(WORK_DIR + "input/" + MUT_INFO, "\t"), 0)

    not_in_cds_list = []
    in_cds_list = []
    for chr_num, mut_list in mut_dict.items():
        cds_idx_list = cds_dict_by_chr['chr' + chr_num]
        for mut_arr in mut_list:
            pos = int(mut_arr[1]) + ADJ_REF_IDX
            tmp_id = int(mut_arr[2])
            if not logic.check_seq_in_cds(cds_idx_list, pos):
                not_in_cds_list.append(mut_arr)
            else:
                in_cds_list.append(mut_arr)

    print(len(not_in_cds_list))
    header = ['#CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']
    util.make_csv(WORK_DIR + '/input/ClinVar_dominant_mutation_on_CDS.txt', header, in_cds_list, deli='\t')
    util.make_csv(WORK_DIR + '/input/ClinVar_dominant_mutation_not_on_CDS.txt', header, not_in_cds_list, deli='\t')

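# check_seq_in_cds is defined in the project's Logic module and is not shown
# here; a minimal alternative sketch using binary search, assuming the
# per-chromosome CDS intervals were pre-sorted and non-overlapping (the
# function and parameter names below are hypothetical):
def pos_in_sorted_intervals_sketch(sorted_starts, paired_ends, pos):
    import bisect

    # find the last interval starting at or before pos, then test its end
    i = bisect.bisect_right(sorted_starts, pos) - 1
    return i >= 0 and pos <= paired_ends[i]
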
def indel_frequency_by_1500x1500_cell_id_w_output_dir():
    util = Util.Utils()
    logic_prep = LogicPrep.LogicPreps()
    logic = Logic.Logics()

    brcd_list = util.csv_to_list_ignr_header(WORK_DIR + INPUT + BRCD_FILE)
    brcd_arr = logic_prep.make_arr_list_to_list(brcd_list)
    cell_id_list = logic_prep.make_cell_id(brcd_arr, "^")

    var_list = util.csv_to_list_ignr_header(WORK_DIR + INPUT + "var_list.txt", "\t")
    for idx in range(len(var_list) // 2):
        main_idx = 2 * idx
        sub_idx = 2 * idx + 1
        main_arr = var_list[main_idx]
        sub_arr = var_list[sub_idx]
        main_sub_nm = [main_arr[0], sub_arr[0]]
        main_path = main_arr[5] + "/CRISPResso_on_" + main_arr[1].replace(".fastq", "") + "_join"
        sub_path = sub_arr[5] + "/CRISPResso_on_" + sub_arr[1].replace(".fastq", "") + "_join"
        path_arr = [main_path, sub_path]

        trgt_list = []
        trgt_err_list = []
        for path in path_arr:
            csv_list = util.csv_to_list_ignr_header(WORK_DIR + INPUT + SUBPATH + path + F_TABLE_FILE, "\t")
            tmp_list, err_list = logic_prep.get_data_by_cell_id(csv_list, brcd_arr, CONST_INIT)
            trgt_list.append(tmp_list)
            trgt_err_list.append(err_list)

        result_list, cnt_hom_hete_wt, junk_arr = logic.get_num_of_reads_percent_of_read_by_cell(
            trgt_list, cell_id_list, THRESHOLD_ARR)

        # make output path
        os.makedirs(WORK_DIR + 'output/' + SUBPATH, exist_ok=True)
        util.make_excel_by_list(
            WORK_DIR + "output/" + SUBPATH + "tot_read_by_cell_homo_hetero_" + main_sub_nm[0] + "_"
            + main_sub_nm[1] + "_" + str(idx), result_list, cnt_hom_hete_wt)

        for tmp_idx in range(len(trgt_err_list)):
            sorted_err_list = logic_prep.sort_list_by_ele(trgt_err_list[tmp_idx], -1)
            logic.count_num_by_err(sorted_err_list)
            util.make_excel_err_list(
                WORK_DIR + "output/" + SUBPATH + main_sub_nm[tmp_idx] + "_error_list_" + str(idx), sorted_err_list)

        junk_file_nm = ['cell_non_junk', 'non_cell_junk']
        for idx_junk in range(len(junk_arr)):
            util.make_excel_by_arr_list(
                WORK_DIR + "output/" + SUBPATH + junk_file_nm[idx_junk] + "_" + main_sub_nm[0] + "_"
                + main_sub_nm[1] + "_" + str(idx), junk_arr[idx_junk])

def split_TE_1_fl_n_by_1_right_away():
    util = Util.Utils()
    logic_prep = LogicPrep.LogicPreps()

    header = ['sequence', '#duple', '#trnscprt', 'chromosome:23bp(spacer + PAM)index range:strand:transcription:fam_name']
    fl_nm_f = WORK_DIR + "output/loop/TE_trgt_cyc"
    cyc_num = 3
    fl_nm_b = "_fln_by_1"
    res_f_num = 0

    # tm_arr = [[1, 2, 3, 5], [0, 4, 6, 7]]
    # tm_arr = [[0, 3], [1, 2]]
    tm_arr = [[0, 2]]
    for i_a in range(len(tm_arr)):
        result_dict = {}
        for i in tm_arr[i_a]:
            print(fl_nm_f + str(cyc_num) + "_" + str(i) + fl_nm_b)
            with open(fl_nm_f + str(cyc_num) + "_" + str(i) + fl_nm_b) as f:
                print(f.readline())
                while True:
                    tmp_line = f.readline().replace("\n", "")
                    if tmp_line == '':
                        break
                    dfam_arr = tmp_line.split('\t')
                    tot_seq = dfam_arr[0]
                    res_key = tot_seq
                    if res_key in result_dict:
                        result_dict[res_key].update(dfam_arr[3].replace(" ", "").split(',')[:-1])
                    else:
                        tmp_set = set(dfam_arr[3].replace(" ", "").split(',')[:-1])
                        result_dict.update({res_key: tmp_set})

        result0_list = []
        result1_list = []
        for res_key, val_set in result_dict.items():
            tmp_str = ""
            cnt_trpt = 0
            for tmp_val in val_set:
                if 'True' in tmp_val:
                    cnt_trpt += 1
                tmp_str += tmp_val + ", "
            if len(val_set) > 1:
                result0_list.append([res_key, len(val_set), cnt_trpt, tmp_str])
            else:
                result1_list.append([res_key, len(val_set), cnt_trpt, tmp_str])
        result_dict.clear()

        sorted_result0_list = logic_prep.sort_list_by_ele(result0_list, 1)
        result0_list.clear()
        util.make_csv(fl_nm_f + str(cyc_num + 1) + "_" + str(res_f_num) + "_fln_by_1", header, sorted_result0_list, 0, '\t')
        res_f_num += 1
        util.make_csv(fl_nm_f + str(cyc_num + 1) + "_" + str(res_f_num) + "_fln_by_1", header, result1_list, 0, '\t')
        res_f_num += 1
        sorted_result0_list.clear()
        result1_list.clear()

def multi_processing_split_big_files_then_find_seq_from_FASTQ():
    print('multi_processing_split_big_files_then_find_seq_from_FASTQ')
    util = Util.Utils()
    logic_prep = LogicPrep.LogicPreps()

    brcd_list = util.read_tb_txt(WORK_DIR + BARCD_SEQ_FILE)
    logic = Logic.Logics(brcd_list)

    # fastq file name without ext
    big_fastq_fl_nm_list = ["19k_ramu", "19k_my"]
    fastq_ext = '.fastq'

    for fastq_fl_nm in big_fastq_fl_nm_list:
        # split big file
        split_init = {
            'big_file_path': WORK_DIR + FASTQ + fastq_fl_nm + fastq_ext,
            'num_row': 4000000,
            'splited_files_dir': WORK_DIR + FASTQ + fastq_fl_nm + "/",
            'output_file_nm': fastq_fl_nm,
            'output_file_ext': fastq_ext
        }
        util.split_big_file_by_row(split_init)

        # get splited_files path
        sources = util.get_files_from_dir(split_init['splited_files_dir'] + '*.fastq')

        result_dict = {}
        for splited_fastq_fl in sources:
            print("get_FASTQ_seq_list :", splited_fastq_fl)
            fastq_list = util.get_FASTQ_seq_list(splited_fastq_fl)

            # divide data_list by MULTI_CNT
            splited_fastq_list = np.array_split(fastq_list, MULTI_CNT)
            fastq_list.clear()
            print("platform.system() : ", SYSTEM_NM)
            print("total cpu_count : ", str(TOTAL_CPU))
            print("will use : ", str(MULTI_CNT))
            pool = mp.Pool(processes=MULTI_CNT)

            ## analyze FASTQ seq after barcode seq
            pool_list = pool.map(logic.get_dict_multi_p_seq_from_FASTQ, splited_fastq_list)
            ## analyze whole FASTQ seq
            # pool_list = pool.map(logic.get_dict_multi_p_seq_from_whole_FASTQ, splited_fastq_list)

            print("merge_pool_list_to_result_dict")
            logic.merge_pool_list_to_result_dict(pool_list, result_dict)
            pool.close()
            pool_list[:] = []

        logic_prep.add_missing_brcd_to_dict(brcd_list, result_dict)

        print("make excel file")
        util.make_dict_to_excel(
            WORK_DIR + "output/result_" + fastq_fl_nm + "_"
            + BARCD_SEQ_FILE.replace("barcode_seq/", "").replace(".txt", ""),
            result_dict)
        result_dict.clear()

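# util.split_big_file_by_row is not shown in this excerpt; a minimal sketch of
# such a splitter, assuming only the split_init keys used above (illustrative,
# not the project's actual implementation). num_row = 4000000 lines corresponds
# to 1000000 FASTQ records of 4 lines each, so records never straddle files:
def split_big_file_by_row_sketch(split_init):
    os.makedirs(split_init['splited_files_dir'], exist_ok=True)

    def flush(chunk, file_idx):
        out_path = (split_init['splited_files_dir'] + split_init['output_file_nm']
                    + "_" + str(file_idx) + split_init['output_file_ext'])
        with open(out_path, 'w') as out:
            out.writelines(chunk)

    chunk, file_idx = [], 0
    with open(split_init['big_file_path']) as src:
        for line_num, line in enumerate(src, start=1):
            chunk.append(line)
            if line_num % split_init['num_row'] == 0:
                flush(chunk, file_idx)
                chunk, file_idx = [], file_idx + 1
    if chunk:  # write the final partial chunk
        flush(chunk, file_idx)
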
def get_seq_by_pam_after_mut(self, path, mut_list, win_arr, init):
    logic_prep = LogicPrep.LogicPreps()
    pam_arr = init[1]
    len_f_pam_arr = init[2]
    len_b_pam_arr = init[3]
    adj_ref_idx = init[4]

    for mut_arr in mut_list:
        chr_num = mut_arr[0]
        pos = int(mut_arr[1]) + adj_ref_idx
        ref_p_seq = mut_arr[3]
        alt_p_seq = mut_arr[4]
        ref_m_seq = ""
        alt_m_seq = ""
        try:
            ref_m_seq += self.make_complement_string(ref_p_seq)
            if alt_p_seq == '.':
                alt_p_seq = ""
            else:
                alt_m_seq += self.make_complement_string(alt_p_seq)
        except Exception as err:
            print("make_complement_string ::: ", err)
            print(ref_p_seq, " : ref_p_seq")
            print(alt_p_seq, " : alt_p_seq")
            print(str(mut_arr))

        seq_record = SeqIO.read(path + "chr" + chr_num + ".fa", "fasta")
        p_seq = str(seq_record.seq).upper()
        m_seq = str(seq_record.seq.complement()).upper()

        ori_win_flag = True
        for idx in range(len(pam_arr)):
            pam = pam_arr[idx]
            len_f_pam = len_f_pam_arr[idx]
            len_b_pam = len_b_pam_arr[idx]

            ref_p_dict, p_ori_win_seq = self.get_matched_pam_p_seq_dict(p_seq, pos, win_arr, ref_p_seq, pam, len_f_pam, len_b_pam)
            ref_m_dict, m_ori_win_seq = self.get_matched_pam_m_seq_dict(m_seq, pos, win_arr, ref_m_seq, pam, len_f_pam, len_b_pam)
            mut_p_dict, _ = self.get_matched_pam_p_seq_dict(p_seq, pos, win_arr, alt_p_seq, pam, len_f_pam, len_b_pam)
            mut_m_dict, _ = self.get_matched_pam_m_seq_dict(m_seq, pos, win_arr, alt_m_seq, pam, len_f_pam, len_b_pam)

            self.remove_dict_val_by_key(mut_p_dict, ref_p_dict.keys())
            self.remove_dict_val_by_key(mut_m_dict, ref_m_dict.keys())

            if ori_win_flag:
                mut_arr.append(p_ori_win_seq + " , " + m_ori_win_seq)
                ori_win_flag = False
            logic_prep.add_result_seq_to_arr(mut_arr, mut_p_dict)
            logic_prep.add_result_seq_to_arr(mut_arr, mut_m_dict)

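# make_complement_string is defined elsewhere in the project; a minimal
# stand-in, assuming it returns the base-wise complement WITHOUT reversing
# (mirroring the seq_record.seq.complement() call above). The name is
# hypothetical:
def make_complement_string_sketch(seq):
    return seq.translate(str.maketrans("ACGTacgt", "TGCAtgca"))
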
def get_GRCh38_Regulatory_Build_regulatory_features_by_ClinVar_dominant_mutation_not_on_CDS():
    clin_var_not_cds_fl_nm = 'ClinVar_dominant_mutation_not_on_CDS.txt'
    GRCh38_features_fl_nm = 'homo_sapiens.GRCh38.Regulatory_Build.regulatory_features.20190329.gff'

    logic = Logic.Logics()
    logic_prep = LogicPrep.LogicPreps()
    util = Util.Utils()

    clin_var_fl = util.read_csv_ignore_N_line(WORK_DIR + IN + clin_var_not_cds_fl_nm, '\t')
    GRCh38_features_fl = util.read_csv_ignore_N_line(WORK_DIR + IN + GRCh38_features_fl_nm, '\t', n_line=0)
    GRCh38_features_dict = logic_prep.get_dict_from_list_by_ele_key(GRCh38_features_fl, 0)

    result_dict = {}
    no_chr_key_list = []
    for clin_var_arr in clin_var_fl:
        tmp_clin_var_key = tuple(clin_var_arr[:5])
        chr_key = clin_var_arr[0]
        pos = int(clin_var_arr[1])
        if chr_key in GRCh38_features_dict:
            result_dict.update({tmp_clin_var_key: []})
            GRCh38_features_list = GRCh38_features_dict[chr_key]
            for GRCh38_features_arr in GRCh38_features_list:
                tmp_type = GRCh38_features_arr[2]
                tmp_info = GRCh38_features_arr[-1]
                st_idx = int(GRCh38_features_arr[3])
                en_idx = int(GRCh38_features_arr[4])
                if st_idx < pos < en_idx:
                    result_dict[tmp_clin_var_key].append([tmp_type, tmp_info])
        else:
            no_chr_key_list.append(tmp_clin_var_key)

    with open(WORK_DIR + OU + 'in_cds.txt', 'w') as in_cds_f, open(WORK_DIR + OU + 'not_in_cds.txt', 'w') as not_cds_f:
        for f_key, val_list in result_dict.items():
            tmp_str = ""
            tmp_blnk = ""
            for tmp_f in f_key:
                tmp_str = tmp_str + tmp_f + '\t'
                tmp_blnk = tmp_blnk + '-' + '\t'
            if len(val_list) == 0:
                not_cds_f.write(tmp_str[:-1] + '\n')
            else:
                for idx in range(len(val_list)):
                    add_str = ""
                    for tm_f in val_list[idx]:
                        add_str = add_str + tm_f + '\t'
                    if idx == 0:
                        in_cds_f.write(tmp_str + add_str[:-1] + '\n')
                    else:
                        in_cds_f.write(tmp_blnk + add_str[:-1] + '\n')

    util.make_csv(WORK_DIR + OU + 'no_chr_key_list.txt', [], no_chr_key_list)

def make_deep_cas9_input():
    logic_prep = LogicPrep.LogicPreps()
    util = Util.Utils()

    trgt_seq_dict = logic_prep.get_target_seq_with_clvg_site_fr_fasta(REF_PATH + CDS_FILE, INIT_BE)
    chr_dict = logic_prep.target_seq_with_clvg_site_group_by_chromosome(trgt_seq_dict)

    util.make_deep_cas9_input(WORK_DIR + "deep_cas_9/sample", [chr_dict], INIT_BE, BATCH_SIZE)

def merge_multi_processing_4seq_excel_result():
    util = Util.Utils()
    logic_prep = LogicPrep.LogicPreps()

    txt_sources = util.get_files_from_dir(WORK_DIR + "output/" + SUB_OUT_DIR + '*.txt')
    total_list = []
    for txt_file in txt_sources:
        total_list.extend(util.read_tb_txt(txt_file))

    merge_dict = logic_prep.make_4seq_list_to_dict(total_list)
    util.make_4seq_dict_to_excel(WORK_DIR + "output/" + SUB_OUT_DIR + "merge_result_count", merge_dict)

def filter_out_cds_wout_strt_cdn(cds_list):
    util = Util.Utils()
    logic_prep = LogicPrep.LogicPreps()
    logic = Logic.Logics()
    print("start filter_out_cds_wout_strt_cdn!!!!")

    result_list = []
    for cds_arr in cds_list:
        gene_sym = cds_arr[0]
        nm_id = cds_arr[1]
        chr_nm = cds_arr[2]
        strand = cds_arr[3]
        orf_strt_pos = int(cds_arr[6])
        orf_end_pos = int(cds_arr[7])

        p_seq, m_seq = util.read_file_by_biopython(REF_DIR + chr_nm + ".fa", "fasta")

        if strand == '+':
            strt_codon = p_seq[orf_strt_pos: orf_strt_pos + 3]
            if strt_codon in STRT_CD_ARR:
                end_codon = p_seq[orf_end_pos - 3: orf_end_pos]
                if end_codon in END_CD_ARR:
                    start_idx_arr, end_idx_arr = logic_prep.get_orf_strt_end_idx_arr(cds_arr)
                    p_cds_seq = logic_prep.get_seq_by_idx_arr(p_seq, start_idx_arr, end_idx_arr)
                    if len(p_cds_seq) % 3 != 0:
                        continue
                    if logic.exist_another_orf_end_codon_in_cds_seq(p_cds_seq):
                        continue
                    tmp_arr = []
                    tmp_arr.extend(cds_arr)
                    result_list.append(tmp_arr)
        else:
            strt_codon = m_seq[orf_end_pos - 3: orf_end_pos][::-1]
            if strt_codon in STRT_CD_ARR:
                end_codon = m_seq[orf_strt_pos: orf_strt_pos + 3][::-1]
                if end_codon in END_CD_ARR:
                    start_idx_arr, end_idx_arr = logic_prep.get_orf_strt_end_idx_arr(cds_arr)
                    m_cds_seq = logic_prep.get_seq_by_idx_arr(m_seq, start_idx_arr, end_idx_arr)
                    if len(m_cds_seq) % 3 != 0:
                        continue
                    if logic.exist_another_orf_end_codon_in_cds_seq(m_cds_seq, False):
                        continue
                    tmp_arr = []
                    tmp_arr.extend(cds_arr)
                    result_list.append(tmp_arr)

    print("DONE filter_out_cds_wout_strt_cdn!!!!")
    return result_list

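# STRT_CD_ARR and END_CD_ARR are defined outside this excerpt; typical values,
# assuming the canonical start/stop codons, would be:
#     STRT_CD_ARR = ['ATG']
#     END_CD_ARR = ['TAA', 'TAG', 'TGA']
# Note that the minus-strand branch above reverses its complemented slices
# ([::-1]) so the codons are read 5' -> 3' before comparison.
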
def main_YG():
    util = Util.Utils()
    logic_prep = LogicPrep.LogicPreps()
    logic = Logic.Logics()

    mut_sum_dict = util.read_txt_dvd_by_tab(WORK_DIR + MUT_FILE)
    sorted_mut_dict = logic_prep.sort_dict(mut_sum_dict)
    result_dict = logic.get_seqs_bfr_aft_trgt_idx(sorted_mut_dict, INITIAL_MAIN)
    util.make_excel(WORK_DIR + "analyze_hg19", sorted_mut_dict, result_dict)

def main_20201127():
    util = Util.Utils()
    logic_prep = LogicPrep.LogicPreps()
    logic = Logic.Logics()

    df = util.read_excel_to_df(WORK_DIR + IN + IN_EXCEL, SHEET_NAME)
    len_df = len(df[df.columns[0]])

    result_list = []
    for i in range(len_df):
        cnt = 0
        ori_seq = df.loc[i][0]
        n_of_mismatch = int(df.loc[i][1])
        n_of_sub_seq = int(df.loc[i][2])

        # slice ori_seq into its structural segments by the ORI_SEQ_STRCTR offsets
        bp_f = ori_seq[:ORI_SEQ_STRCTR[0]]
        bf_spacer_n_fr_ngg = ori_seq[ORI_SEQ_STRCTR[0]:ORI_SEQ_STRCTR[1]]
        gg_fr_ngg = ori_seq[ORI_SEQ_STRCTR[1]:ORI_SEQ_STRCTR[2]]
        bf_rtt_only = ori_seq[ORI_SEQ_STRCTR[2]:ORI_SEQ_STRCTR[3]]
        bp_b = ori_seq[ORI_SEQ_STRCTR[3]:]

        bf_mm_seq = bf_spacer_n_fr_ngg + bf_rtt_only
        idx_set = logic_prep.make_seq_idx_set(0, len(bf_mm_seq))

        # keep sampling until n_of_sub_seq distinct mismatched sequences exist
        mm_seq_set = set()
        while len(mm_seq_set) < n_of_sub_seq:
            # random.sample no longer accepts a set in Python >= 3.11,
            # so turn it into a sorted sequence first
            mm_idx_list = random.sample(sorted(idx_set), n_of_mismatch)
            af_mm_seq = bf_mm_seq
            for j in mm_idx_list:
                tmp_set = BASE_NT - {bf_mm_seq[j].lower()}
                af_mm_seq = logic.swap_char_in_string(af_mm_seq, j, random.sample(sorted(tmp_set), 1)[0])
            mm_seq_set.add(af_mm_seq)

        af_mm_seq_list = list(mm_seq_set)
        for tmp_seq in af_mm_seq_list[:n_of_sub_seq]:
            if cnt == 0:
                result_list.append([
                    ori_seq,
                    bp_f + tmp_seq[:len(bf_spacer_n_fr_ngg)] + gg_fr_ngg + tmp_seq[len(bf_spacer_n_fr_ngg):] + bp_b,
                    n_of_mismatch
                ])
            else:
                result_list.append([
                    '',
                    bp_f + tmp_seq[:len(bf_spacer_n_fr_ngg)] + gg_fr_ngg + tmp_seq[len(bf_spacer_n_fr_ngg):] + bp_b,
                    n_of_mismatch
                ])
            cnt += 1

    util.make_excel(WORK_DIR + OU + SHEET_NAME + '_result', ['ori_seq', 'sub_seq', '#_of_mismatch'], result_list)

def sort_n_merge_by_chr(self, init_merge, init_be):
    ref_path = init_merge[0]
    cdf_file = init_merge[1]
    a_or_c_idx = init_merge[2]
    a_c_rule = init_merge[3]
    work_dir = init_merge[4]
    top_n = init_merge[5]

    logic_prep = LogicPrep.LogicPreps()
    util = Util.Utils()

    trgt_seq_dict = logic_prep.get_target_seq_with_clvg_site(ref_path + cdf_file, init_be)
    chr_dict = logic_prep.target_seq_with_clvg_site_group_by_chromosome(trgt_seq_dict, "primary_assembly:ASM275486v1:")
    a_c_dict = self.filter_out_by_ACGTU_rule(chr_dict, a_or_c_idx, a_c_rule)

    abe_score_dict = logic_prep.get_deep_base_ed_score(work_dir + "deep_ABE/ABE_Efficiency.txt")
    cbe_score_dict = logic_prep.get_deep_base_ed_score(work_dir + "deep_CBE/CBE_Efficiency.txt")
    cs9_score_dict = logic_prep.get_deep_cas9_tupl(work_dir + "deep_cas_9/", "RANK_final_DeepCas9_Final.txt", "sample.txt")

    top_n_abe_list = []
    top_n_cbe_list = []
    for chr_key, trnscrpt_list in a_c_dict.items():
        result_list = []
        result_list = logic_prep.merge_cas9_abe_cbe_to_list(
            chr_key, [trnscrpt_list, abe_score_dict, cbe_score_dict, cs9_score_dict], result_list)

        sort_by_abe_list = logic_prep.sort_by_idx_element(result_list, -2, [])
        sort_by_cbe_list = logic_prep.sort_by_idx_element(result_list, -1, [])

        # TODO: before extending the TOP N lists (top_n_abe_list, top_n_cbe_list),
        # the same context seq appearing in different trnscrpt still needs to be
        # filtered out
        top_n_abe_list.extend(sort_by_abe_list[:top_n])
        top_n_cbe_list.extend(sort_by_cbe_list[:top_n])

    util.make_excel_after_sorting(work_dir + "merge_cas9_abe_cbe_top_N/merge_by_ABE_top_" + str(top_n), top_n_abe_list, init_be)
    util.make_excel_after_sorting(work_dir + "merge_cas9_abe_cbe_top_N/merge_by_CBE_top_" + str(top_n), top_n_cbe_list, init_be)