handle_expsum = open(exp_summary)
expsum_content = handle_expsum.read()

# Download based on the existing FASTA.
first_comp, second_comp, pfm_comp1, pfm_comp2 = find_component(
    fasta_exprun, database_content, expsum_content)

# Example pattern: ERR[0-9]*?.*(?=ALX4_ALX4)
exp_entries = re.findall('ERR[0-9]*?.*' + first_comp + '_' + second_comp,
                         expsum_content)
for exp_entry in exp_entries:
    temp_exp = re.findall('ERR[0-9]*', exp_entry)[0]
    # Check whether the original SELEX data is already there.
    # Check for redundancy; if there is none, generate ref_dict.
    if os.path.isfile('Motif/' + temp_exp + '.fa'):
        continue
    else:
        if os.path.isfile(fasta_dir + temp_exp + '.fa'):
            pass
        else:
            call(['bash', 'sra_dl_single.sh', temp_exp])
        print(temp_exp)
        handle_fasta = open(fasta_dir + temp_exp + '.fa')
        fasta_dict = extract_motif.fasta_parser(handle_fasta)
        temp_dict = parse_pfm_dict(temp_exp, fasta_dir)
        extract_motif.output_pfm_dict(temp_dict, temp_exp + '.fa', 'Motif/')
        handle_fasta.close()
        call(['rm', fasta_dir + temp_exp + '.fa'])
handle_expsum.close()
handle_database.close()
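# --- Hedged sketch (assumption): find_best_kmer is defined elsewhere in this
# repository and is not shown in this section.  The illustrative version below
# only mirrors the interface used here: slide a window of length l_kmer over
# the read, score each k-mer against a 4 x l_kmer PWM (rows assumed to be
# ordered A, C, G, T) on both strands, and return
# (best_index, best_orientation, best_kmer, best_score),
# where orientation is 1 for the forward strand and -1 for the reverse strand.
def find_best_kmer_sketch(pwm, seq, l_kmer):
    comp = str.maketrans('ACGT', 'TGCA')
    base_row = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    best_index, best_orient, best_kmer, best_score = 0, 1, '', float('-inf')
    for start in range(len(seq) - l_kmer + 1):
        fwd = seq[start:start + l_kmer]
        rev = fwd.translate(comp)[::-1]
        for orient, kmer in ((1, fwd), (-1, rev)):
            if any(base not in base_row for base in kmer):
                continue  # skip k-mers containing ambiguous bases such as N
            score = sum(pwm[base_row[base]][pos] for pos, base in enumerate(kmer))
            if score > best_score:
                best_index, best_orient, best_kmer, best_score = start, orient, kmer, score
    return best_index, best_orient, best_kmer, best_score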
def parse_pfm_dict(fasta_exprun, fasta_dir):
    # fasta_exprun = sys.argv[1]
    # fasta_dir = sys.argv[2]
    # output = mp.Queue()
    l_kmer = 6
    JASPAR_database = "pfm_vertebrates.txt"
    handle_database = open(JASPAR_database)
    database_content = handle_database.read()
    exp_summary = "ERP008935_info.csv"
    handle_expsum = open(exp_summary)
    expsum_content = handle_expsum.read()
    handle_fasta = open(fasta_dir + fasta_exprun + '.fa')
    fasta_dict = extract_motif.fasta_parser(handle_fasta)
    first_comp, second_comp, pfm_comp1, pfm_comp2 = find_component(
        fasta_exprun, database_content, expsum_content)
    if len(pfm_comp1) > 0 and len(pfm_comp2) > 0:
        print(first_comp, second_comp)
    else:
        exit()
    new_pfm_dict = dict()
    ref_pwm_short = extract_motif.tf_proc(database_content,
                                          [first_comp, second_comp], l_kmer)
    new_pfm = np.zeros(np.shape(ref_pwm_short[first_comp]))
    count_run = 0
    print(len(fasta_dict))
    for read_nmer in fasta_dict:
        count_run = count_run + 1
        if count_run % 10000 == 0:
            print(count_run)
        category_name = ''
        spacing = 0
        # kmers = [fasta_dict[read_nmer][n:n + l_kmer]
        #          for n in range(0, len(fasta_dict[read_nmer]) - l_kmer + 1)]
        best_kmer = ''
        best_score = 0
        # tic()
        # Best-scoring hit of each component PWM in the read:
        # (index, orientation, k-mer, score); orientation 1 = forward, -1 = reverse.
        fbest_kmer_index, fbest_kmer_orient, fbest_kmer, fbest_score = find_best_kmer(
            ref_pwm_short[first_comp], fasta_dict[read_nmer], l_kmer)
        rbest_kmer_index, rbest_kmer_orient, rbest_kmer, rbest_score = find_best_kmer(
            ref_pwm_short[second_comp], fasta_dict[read_nmer], l_kmer)
        # toc()
        # f_pwm = np.matrix(ref_pwm_short[first_comp], copy=False)
        # s_pwm = np.matrix(ref_pwm_short[second_comp], copy=False)
        # print(f_pwm)
        # tic()
        # fbest_kmer_index, fbest_kmer_orient = find_best_kmer_mp(f_pwm, fasta_dict[read_nmer], l_kmer)
        # rbest_kmer_index, rbest_kmer_orient = find_best_kmer_mp(s_pwm, fasta_dict[read_nmer], l_kmer)
        # toc()
        # Skip reads where the two half-site hits are too far apart or overlap too heavily.
        if abs(fbest_kmer_index - rbest_kmer_index) > 5 + l_kmer or abs(fbest_kmer_index - rbest_kmer_index) < 3:
            continue
        # else:
        #     print(fbest_kmer_orient, rbest_kmer_orient, fbest_kmer_index, rbest_kmer_index)
        # Classify the read by the order and strand orientation of the two half-sites.
        if fbest_kmer_orient == 1 and rbest_kmer_orient == 1:
            # if abs(fbest_kmer_orient*fbest_kmer_index - rbest_kmer_orient*rbest_kmer_index) > 5 + l_kmer:
            #     continue
            if fbest_kmer_index < rbest_kmer_index:
                spacing = rbest_kmer_index - fbest_kmer_index - l_kmer
                category_name = first_comp + '::' + second_comp + '_' + '1+2+' + '_' + str(spacing)
                motif = fasta_dict[read_nmer][fbest_kmer_index:rbest_kmer_index + l_kmer]
            else:
                spacing = fbest_kmer_index - rbest_kmer_index - l_kmer
                category_name = first_comp + '::' + second_comp + '_' + '2+1+' + '_' + str(spacing)
                motif = fasta_dict[read_nmer][rbest_kmer_index:fbest_kmer_index + l_kmer]
        if fbest_kmer_orient == -1 and rbest_kmer_orient == -1:
            # if abs(fbest_kmer_orient*fbest_kmer_index - rbest_kmer_orient*rbest_kmer_index) > 5 + l_kmer:
            #     continue
            if fbest_kmer_index < rbest_kmer_index:
                spacing = rbest_kmer_index - fbest_kmer_index - l_kmer
                category_name = first_comp + '::' + second_comp + '_' + '2+1+' + '_' + str(spacing)
                motif = reverse_comp(fasta_dict[read_nmer][fbest_kmer_index:rbest_kmer_index + l_kmer])
            else:
                spacing = fbest_kmer_index - rbest_kmer_index - l_kmer
                category_name = first_comp + '::' + second_comp + '_' + '1+2+' + '_' + str(spacing)
                motif = reverse_comp(fasta_dict[read_nmer][rbest_kmer_index:fbest_kmer_index + l_kmer])
        if fbest_kmer_orient == 1 and rbest_kmer_orient == -1:
            # if abs(fbest_kmer_index - rbest_kmer_index) < 3:
            #     continue
            if fbest_kmer_index < rbest_kmer_index:
                spacing = rbest_kmer_index - fbest_kmer_index - l_kmer
                category_name = first_comp + '::' + second_comp + '_' + '1+2-' + '_' + str(spacing)
                motif = fasta_dict[read_nmer][fbest_kmer_index:rbest_kmer_index + l_kmer]
            else:
                spacing = fbest_kmer_index - rbest_kmer_index - l_kmer
                category_name = first_comp + '::' + second_comp + '_' + '2-1+' + '_' + str(spacing)
                motif = fasta_dict[read_nmer][rbest_kmer_index:fbest_kmer_index + l_kmer]
        if fbest_kmer_orient == -1 and rbest_kmer_orient == 1:
            # if abs(fbest_kmer_index - rbest_kmer_index) < 3:
            #     continue
            if fbest_kmer_index > rbest_kmer_index:
                spacing = fbest_kmer_index - rbest_kmer_index - l_kmer
                category_name = first_comp + '::' + second_comp + '_' + '1+2-' + '_' + str(spacing)
                motif = reverse_comp(fasta_dict[read_nmer][rbest_kmer_index:fbest_kmer_index + l_kmer])
            else:
                spacing = rbest_kmer_index - fbest_kmer_index - l_kmer
                category_name = first_comp + '::' + second_comp + '_' + '2-1+' + '_' + str(spacing)
                motif = reverse_comp(fasta_dict[read_nmer][fbest_kmer_index:rbest_kmer_index + l_kmer])
        # print(motif, category_name)
        # print(category_name, spacing)
        # new_pfm = extract_motif.pfm_writer(new_pfm, best_kmer)
        # print(best_score)
        new_pfm_dict = pfm_dict_writer(new_pfm_dict, category_name, motif)
    # print(ref_pwm_short[first_comp])
    print(new_pfm_dict)
    return new_pfm_dict
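# --- Hedged sketches (assumption): reverse_comp and pfm_dict_writer are
# defined elsewhere in this repository; the minimal versions below only
# illustrate the behaviour implied by their use in parse_pfm_dict above.

# Reverse complement of a DNA string (N maps to N).
def reverse_comp_sketch(seq):
    return seq.translate(str.maketrans('ACGTN', 'TGCAN'))[::-1]

# Accumulate one oriented/spaced motif into a per-category position frequency
# matrix: one 4 x motif-length count matrix per category_name
# (rows assumed to be ordered A, C, G, T).
def pfm_dict_writer_sketch(pfm_dict, category_name, motif):
    base_row = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    if category_name not in pfm_dict:
        pfm_dict[category_name] = np.zeros((4, len(motif)))
    counts = pfm_dict[category_name]
    for pos, base in enumerate(motif):
        if base in base_row and pos < counts.shape[1]:
            counts[base_row[base], pos] += 1
    return pfm_dict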
        temp_item = item
        newlist = newlist + [item]
    return newlist


if __name__ == "__main__":
    main_tf = sys.argv[1]
    fasta_handle = open(sys.argv[3])
    motif_sets_dir = sys.argv[2]
    JASPAR_database = "pfm_vertebrates.txt"
    handle_database = open(JASPAR_database)
    database_content = handle_database.read()
    ref_pfm = extract_motif.tf_proc_parse_s(database_content, main_tf, 6)
    chip_data = extract_motif.fasta_parser(fasta_handle)
    file_list = listdir_nohidden(motif_sets_dir)
    pfm_dict = dict()
    for file_entry in file_list:
        file_entry_handle = open(motif_sets_dir + file_entry)
        file_entry_content = file_entry_handle.read()
        pfm_dict.update(extract_motif.tf_proc_all(file_entry_content))
    conc_chip_data = dict()
    broadpeaks = []
    # print(ref_pfm[main_tf], main_tf)
    for chip_entry in chip_data:
        broadpeaks = broadpeaks + extract_spacing_pfm.find_best_kmer_chip(
            ref_pfm[main_tf], chip_data[chip_entry])
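    # Example invocation (assumption: the script file name and the FASTA path
    # below are placeholders; argv[1] is the main TF name, argv[2] the
    # directory of per-category motif PFM files, argv[3] the ChIP-seq FASTA):
    #   python compare_chip_spacing.py ALX4 Motif/ chip_peaks.fa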