def extract_para(l_trees):
    global set_filter
    with open(out_dir / 's_filter.pic', 'rb') as f:
        set_filter = pickle.load(f)

    global path_tmp_para
    path_tmp_para = utils.create_out_dir('./dir_step3/tmp_para')

    # extract para from tree files
    print(' extract para from trees')
    pool = Pool(nb_threads)
    tmp_res = pool.map_async(extract_para_from_trees, l_trees, chunksize=1)
    pool.close()
    pool.join()

    # combine results: count, for each known ortho pair, how often it also appears as a para pair
    d_para = utils.get_pickle(out_dir / 'd_ortho.pic')
    d_para = {x: 0 for x in d_para}
    for filename in l_trees:
        # load pickle file with the list of para pairs found in this tree file
        with open(path_tmp_para / filename, 'rb') as f:
            content_pickle = pickle.load(f)
        for pair in content_pickle:
            try:
                d_para[pair] += 1
            except KeyError:
                # ignore para pairs that are not in the ortho dict
                pass

    # save it to file
    utils.save_pickle(out_dir / 'd_para.pic', d_para)

    # free memory and remove temporary directories
    set_filter = None
    shutil.rmtree(path_tmp_para)
    shutil.rmtree(path_tmp)
def extract_ortho(l_trees):
    print(' extract ortho from similarity')

    # load pickle files
    tmp_d = utils.get_multi_pickle(Path('dir_step2') / 'dict_similarity_ortho', '_similarity_ortho.pic')

    # extract ortho: each pair of protein names is encoded as a single integer key
    # (length of the first name, followed by the two names concatenated)
    d_ortho = collections.defaultdict(int)
    for l in tmp_d.values():
        for sub in itertools.combinations(l, 2):
            pair_int = int(str(len(sub[0])) + sub[0] + sub[1])
            d_ortho[pair_int] += 1

    global path_tmp, path_tmp_ortho
    path_tmp = utils.create_out_dir('./dir_step3/tmp')
    path_tmp_ortho = utils.create_out_dir('./dir_step3/tmp_ortho')

    # extract ortho from tree files
    print(' extract ortho from trees')
    pool = Pool(nb_threads)
    tmp_res = pool.map_async(extract_ortho_from_trees, l_trees, chunksize=1)
    pool.close()
    pool.join()

    # unpack the ortho pairs found by the workers and add them to the counts
    for filename in l_trees:
        with open(path_tmp_ortho / filename, 'rb') as f:
            content_pickle = pickle.load(f)
        for pair in content_pickle:
            d_ortho[pair] += 1

    # free memory
    content_pickle = None
    shutil.rmtree(path_tmp_ortho)

    # remove ortho found only once
    print(' remove ortho found only once')
    d_ortho = {k: v for k, v in d_ortho.items() if v > 1}

    # save it to file
    utils.save_pickle(out_dir / 'd_ortho.pic', d_ortho)

    # save a simplified version (keys without their first 2 digits) as a set to file
    s_filter = set()
    for k in d_ortho:
        k2 = str(k)
        s_filter.add(int(k2[2:]))
    utils.save_pickle(out_dir / 's_filter.pic', s_filter)
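
# Illustration of the pair-key encoding used in extract_ortho() above. The helper
# name pair_key is hypothetical (not part of this module): it simply mirrors the
# expression int(str(len(sub[0])) + sub[0] + sub[1]). Protein names are digit
# strings at this stage; the 10-digit names in the example are an assumption made
# so that the length prefix is two digits, consistent with the k2[2:] slice that
# builds s_filter above.
def pair_key(name_a, name_b):
    # '<len(name_a)><name_a><name_b>' parsed as a single integer
    return int(str(len(name_a)) + name_a + name_b)

# e.g. pair_key('0000000042', '0000000317') -> 1000000000420000000317
#      the corresponding s_filter entry drops the '10' prefix: 420000000317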
def step1_kmer_clustering(dir, ext, lk, ma, nt):
    # convert the parameters to global variables (horrible hack)
    global directory, extension, length_kmer, min_aa, nb_threads
    directory, extension, length_kmer, min_aa, nb_threads = Path(dir), ext, lk, ma, nt

    print('\n --- STEP 1: kmer clustering\n')
    print(' # parameters')
    print(' input dir : ' + str(directory))
    print(' kmer size : ' + str(length_kmer))
    print(' kmer nb aa : ' + str(min_aa))

    ## create output directory (delete it first if it already exists)
    global out_dir
    out_dir = utils.create_out_dir('dir_step1')

    ## check directory and files
    print('\n # check input files')
    global dict_files, list_files, list_start
    dict_files, list_files, list_start = pre_checking(directory, extension)

    ## analyse each fasta file (multithreading)
    print('\n # kmer clustering\n ' + str(len(list_files)) + ' proteomes on ' + str(nb_threads) + ' threads')
    pool = ThreadPool(nb_threads)
    tmp_res = pool.map_async(process_file, list_files, chunksize=1)
    results_2 = tmp_res.get()
    pool.close()
    pool.join()

    ## create log file
    log_file = open(out_dir / 'log_step1.txt', 'w+')
    log_file.write('#index file_name nb_initial nb_final\n')

    ## save log file and combine other info
    combined = dict()
    names = dict()
    nb_final = 0
    for l in results_2:
        log_file.write(' '.join(l[:4]) + '\n')
        names.update(l[4])
        combined.update(l[5])
        nb_final += int(l[3])
    log_file.close()

    ## save pickle files
    utils.save_pickle(out_dir / 'combined_names.pic', combined)
    utils.save_pickle(out_dir / 'original_names.pic', names)
    utils.save_pickle(out_dir / 'species_index.pic', dict_files)

    print(' -> ' + str(nb_final) + ' proteins saved for the next step')
    print('')
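
# Minimal usage sketch for step1_kmer_clustering(); every value below is an
# illustrative placeholder, not a project default, and whether the extension is
# passed with or without a leading dot depends on pre_checking(), which is not
# shown here.
#
#   step1_kmer_clustering('./proteomes', 'fasta', 6, 4, 8)
#       # dir, ext, kmer length, kmer nb aa, threads
#
# Results are written to 'dir_step1' (log_step1.txt plus the three pickle files
# saved above), which the later steps read from fixed relative paths.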
def step2_phylomes(eval, msp, pdia, pfas, tt, pm, nt):
    # convert the parameters to global variables (horrible hack)
    global evalue, max_per_species, path_diamond, path_fasttree, trim_thres, phylo_method, nb_threads
    evalue, max_per_species, path_diamond, path_fasttree, trim_thres, phylo_method, nb_threads = eval, msp, pdia, pfas, tt, pm, nt

    print('\n --- STEP 2: phylomes\n')
    print(' # parameters')
    print(' e_value : ' + str(evalue))
    print(' nb_hits : ' + str(max_per_species))
    print(' gaps : ' + str(trim_thres))
    print(' phylogenies : ' + phylo_method.replace('nj', 'neighbor joining').replace('me', 'minimum evolution').replace('ml', 'maximum likelihood'))
    print(' threads : ' + str(nb_threads))

    ## create output directory (or empty it if it already exists)
    global out_dir
    out_dir = utils.create_out_dir('dir_step2')
    Path(out_dir / 'dict_trees').mkdir()
    Path(out_dir / 'dict_output').mkdir()
    Path(out_dir / 'dict_similarity_ortho').mkdir()

    ## check input data directory
    global list_files
    print('\n # check input files')
    list_files = pre_checking_data(Path('dir_step1'))

    ## load all sequences
    global name_2_sp_phylip_seq, all_species
    name_2_sp_phylip_seq, all_species = create_dict_seq(list_files)

    ## create databases
    global db_dir
    db_dir = out_dir / 'databases'
    db_dir.mkdir(parents=True, exist_ok=True)
    multithread_databases(list_files, nb_threads)

    ## process each proteome
    print('\n # build phylomes ... be patient')
    multithread_process_file(list_files, nb_threads)

    ## save prot 2 species dict (needed for steps 3 and 4)
    save_prot_2_sp(name_2_sp_phylip_seq)

    ## delete databases directory
    shutil.rmtree(db_dir)

    print(' done\n')
def step4_orthologous_pairs(lo, nsp, nt):
    # convert the parameters to global variables (horrible hack)
    global limit_ortho, not_same_sp, nb_threads
    limit_ortho, not_same_sp, nb_threads = lo, nsp, nt

    print('\n --- STEP 4: orthologous pairs\n')
    print(' ## parameters')
    print(' ratio ortho : ' + str(limit_ortho))
    print(' not same sp : ' + str(not_same_sp))
    print(' threads : ' + str(nb_threads))
    print('\n ## load data')

    ## create output directory (delete it first if it already exists)
    global out_dir
    out_dir = utils.create_out_dir('dir_step4')

    ## check directory
    files_blast_list, files_tree = pre_checking(Path('dir_step2'))

    ## get original and combined names
    original_name = utils.get_pickle(Path('dir_step1') / 'original_names.pic')
    combined_prot = utils.get_pickle(Path('dir_step1') / 'combined_names.pic')

    ## load all data
    global all_no_tree, all_trees
    all_no_tree, all_trees, all_OGs = load_all_data(files_blast_list, files_tree)

    ## load prot_name 2 species dict (string version)
    global prot_2_sp
    prot_2_sp = utils.get_pickle(Path('dir_step2') / 'prot_str_2_species.pic')

    ## analyse OGs 1 by 1 and save ortho relationships in file
    print('\n ## analyse ' + str(len(all_OGs)) + ' orthologous groups 1 by 1')
    multithread_process_OG(all_OGs, nb_threads, original_name, combined_prot, not_same_sp)

    print(' done\n')
def step3_orthology_network(rov, mw, mnh, lm, nbsp, nt):
    # convert the parameters to global variables (horrible hack)
    global sp_overlap, min_weight, min_nb_hits, chimeric_edges, chimeric_species, nb_threads
    sp_overlap, min_weight, min_nb_hits, chimeric_edges, chimeric_species, nb_threads = rov, mw, mnh, lm, nbsp, nt

    print('\n --- STEP 3: network analysis\n')
    print(' ## parameters')
    print(' species overlap : ' + str(sp_overlap))
    print(' min edge weight : ' + str(min_weight))
    print(' min nb hits : ' + str(min_nb_hits))
    print(' chimeric edges : ' + str(chimeric_edges))
    print(' chimeric species : ' + str(chimeric_species))
    print(' threads : ' + str(nb_threads))

    ## create output directory (or empty it if it already exists)
    global out_dir
    out_dir = utils.create_out_dir('dir_step3')

    ## get all species
    all_species = utils.get_pickle(Path('dir_step1') / 'species_index.pic')

    ## check directory
    files_trees = pre_checking(Path('dir_step2'))

    ## create log file
    global log_file
    log_file = open(out_dir / 'log_step3.txt', 'w+')

    ## load prot_name 2 species dict (string version)
    global prot_2_sp
    prot_2_sp = utils.get_pickle(Path('dir_step2') / 'prot_str_2_species.pic')

    print('\n ## get ortho and para')
    ## extract ortho from dir_step2
    extract_ortho(files_trees)

    ## extract para from dir_step2
    prot_2_sp = None
    extract_para(files_trees)

    ## load prot_name 2 species dict (integer version)
    prot_2_sp = utils.get_pickle(Path('dir_step2') / 'prot_int_2_species.pic')

    print('\n ## network analysis')
    ## build network
    global all_nodes, all_edges
    all_nodes, all_edges = build_network()

    ## load all search outputs
    print(' load similarity search outputs')
    global other_hits
    other_hits = load_search_outputs(Path('dir_step2') / 'dict_output', '_output.pic')

    ## define the maximum number of edges to consider for the lcc (2 * nb_species, or 10 if there are fewer species) to improve speed
    global limit_degree, limit_nb_max
    limit_degree, limit_nb_max = get_limit_lcc(all_species)

    ## calculate the local clustering coefficient of each node
    global all_lcc
    all_lcc = multithread_lcc(all_nodes, nb_threads)

    ## get all connected components
    list_cc = utils.get_connected_components(all_edges, all_nodes)

    ## analyse each connected component
    communities, all_chimeric_prot = analyse_cc(list_cc)

    ## remove spurious hits
    cleaned_communities = remove_spurious_hits(communities)

    ## save OG lists, fusions and stats
    save_outputs(cleaned_communities, all_chimeric_prot, all_species)

    ## clean output dir (remove intermediate pickles) and close the log file
    (out_dir / 'd_ortho.pic').unlink()
    (out_dir / 'd_para.pic').unlink()
    (out_dir / 's_filter.pic').unlink()
    log_file.close()

    print('')
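
# Hypothetical end-to-end driver: the function name _run_all_steps_example and
# every argument value below are illustrative placeholders (not project defaults,
# and not part of this module). It only shows the order in which the four step
# functions above are meant to be called, each step reading the dir_stepN output
# written by the previous one; 'diamond' and 'fasttree' are assumed to be the
# paths of the two external binaries.
def _run_all_steps_example():
    step1_kmer_clustering('./proteomes', 'fasta', 6, 4, 8)           # dir, ext, kmer length, kmer nb aa, threads
    step2_phylomes(0.001, 6, 'diamond', 'fasttree', 0.1, 'nj', 8)    # evalue, hits per species, diamond / fasttree paths, gap threshold, method ('nj', 'me' or 'ml'), threads
    step3_orthology_network(0.5, 2, 2, 2, 2, 8)                      # species overlap, min edge weight, min nb hits, chimeric edges, chimeric species, threads
    step4_orthologous_pairs(0.5, True, 8)                            # ratio ortho, not same species, threads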