Example #1
def extract_para(l_trees):

    # load the ortho pair filter built in extract_ortho
    global set_filter
    set_filter = utils.get_pickle(out_dir / 's_filter.pic')
    
    global path_tmp_para
    path_tmp_para  = utils.create_out_dir('./dir_step3/tmp_para')
    
    # extract para from tree files
    print(' extract para from trees')   
    pool = Pool(nb_threads) 
    tmp_res = pool.map_async(extract_para_from_trees, l_trees, chunksize=1)
    pool.close() 
    pool.join()

    # combine results: initialise para counts using the ortho pair keys
    d_para = utils.get_pickle(out_dir / 'd_ortho.pic')
    d_para = {x: 0 for x in d_para}
    for filename in l_trees:
        # load the pickle file containing the list of para pairs
        with open(path_tmp_para / filename, 'rb') as f:
            content_pickle = pickle.load(f)
        for pair in content_pickle:
            try:
                d_para[pair] += 1
            except KeyError:
                # pair not in the ortho-derived key set -> ignore it
                pass
                
    # save it to file
    utils.save_pickle(out_dir / 'd_para.pic', d_para)
    
    # free memory and remove temporary directories
    set_filter = None
    shutil.rmtree(path_tmp_para)
    shutil.rmtree(path_tmp)
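
The examples lean on two small pickle helpers, utils.save_pickle and utils.get_pickle, that are not part of this excerpt. A minimal sketch of what they might look like (the names come from the calls above; the bodies are an assumption):

import pickle

def save_pickle(path, obj):
    # hypothetical helper: serialise an object to a pickle file
    with open(path, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)

def get_pickle(path):
    # hypothetical helper: load and return the object stored in a pickle file
    with open(path, 'rb') as f:
        return pickle.load(f)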
Example #2
def extract_ortho(l_trees):
        
    print(' extract ortho from similarity')
    # load pickle files
    tmp_d = utils.get_multi_pickle(Path('dir_step2') / 'dict_similarity_ortho', '_similarity_ortho.pic')

    # count the ortho pairs implied by the similarity data
    d_ortho = collections.defaultdict(int)
    for l in tmp_d.values():
        for sub in itertools.combinations(l, 2):
            # encode the pair as one integer: nb of digits of the 1st id, then both ids concatenated
            pair_int = int(str(len(sub[0])) + sub[0] + sub[1])
            d_ortho[pair_int] += 1
    
    global path_tmp, path_tmp_ortho
    path_tmp = utils.create_out_dir('./dir_step3/tmp')
    path_tmp_ortho = utils.create_out_dir('./dir_step3/tmp_ortho')
    
    # extract ortho from tree files
    print(' extract ortho from trees')    
    pool = Pool(nb_threads) 
    tmp_res = pool.map_async(extract_ortho_from_trees, l_trees, chunksize=1)
    pool.close() 
    pool.join()            
    
    # unpack the ortho pairs found in the trees and add them to the counts
    for filename in l_trees:
        with open(path_tmp_ortho / filename, 'rb') as f:
            content_pickle = pickle.load(f)
        for pair in content_pickle:
            d_ortho[pair] += 1

    # free memory
    content_pickle = None
    shutil.rmtree(path_tmp_ortho)

    # remove ortho found only once
    print(' remove ortho found only once')
    d_ortho = {k:v for k,v in d_ortho.items() if v > 1}
    
    # save it to file
    utils.save_pickle(out_dir / 'd_ortho.pic', d_ortho)

    # save a simplified version (without the first 2 digits) as a set to file
    s_filter = set()
    for k in d_ortho:
        k2 = str(k)
        s_filter.add( int(k2[2:]) )
    utils.save_pickle(out_dir / 's_filter.pic', s_filter)
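
To make the pair encoding in extract_ortho concrete, here is a small illustration of what int(str(len(sub[0])) + sub[0] + sub[1]) produces; the identifiers are invented and the only assumption, taken from the loop above, is that both ids are digit strings:

# hypothetical digit-string protein ids
a, b = '57', '4012'
pair_int = int(str(len(a)) + a + b)
# str(len(a)) is '2', so pair_int == int('2574012') == 2574012
# the leading digit(s) record how many characters belong to the first id,
# so a single integer key can represent the ordered pair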
Example #3
def step1_kmer_clustering(dir, ext, lk, ma, nt):
   
    # convert the parameters to global variables (horrible hack)
    global directory, extension, length_kmer, min_aa, nb_threads
    directory, extension, length_kmer, min_aa, nb_threads = Path(dir), ext, lk, ma, nt

    print('\n --- STEP 1: kmer clustering\n')
    print(' # parameters')
    print(' input dir     : ' + str(directory))
    print(' kmer size     : ' + str(length_kmer))
    print(' kmer nb aa    : ' + str(min_aa))

    ## create output directory (delete it first if it already exists)
    global out_dir
    out_dir = utils.create_out_dir('dir_step1')
    
    ## check directory and files
    print('\n # check input files')
    global dict_files, list_files, list_start
    dict_files, list_files, list_start = pre_checking(directory, extension)
    
    ## analyse each fasta file (multithreading)
    print('\n # kmer clustering\n ' + str(len(list_files)) + ' proteomes on ' + str(nb_threads) + ' threads')
    pool = ThreadPool(nb_threads) 
    tmp_res = pool.map_async(process_file, list_files, chunksize=1)
    results_2 = tmp_res.get()
    pool.close() 
    pool.join()
    
    ## create log file
    log_file = open(out_dir / 'log_step1.txt', 'w+')
    log_file.write('#index\tfile_name\tnb_initial\tnb_final\n')
    
    ## save log file and combine other info
    combined = dict()
    names    = dict()
    nb_final = 0
    for l in results_2:
        log_file.write('\t'.join(l[:4]) + '\n')
        names.update(l[4])
        combined.update(l[5])
        nb_final += int(l[3])
    log_file.close()
    
    ## save pickle files
    utils.save_pickle(out_dir / 'combined_names.pic', combined)
    utils.save_pickle(out_dir / 'original_names.pic', names)
    utils.save_pickle(out_dir / 'species_index.pic', dict_files)
    
    print(' -> ' + str(nb_final) + ' proteins saved for the next step')
    print('')
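
The unpacking loop above implies a fixed shape for each element of results_2: the first four items are the log fields written as strings, item 4 is merged into the original-names dict and item 5 into the combined-names dict. A sketch of what one process_file return value might therefore look like (only the tuple layout is taken from the loop; the concrete keys and values are invented):

# hypothetical result for one proteome, matching the unpacking in the loop above
example_result = (
    '0',                    # l[0] proteome index          -> log column 1
    'species_A.fasta',      # l[1] file name               -> log column 2
    '1523',                 # l[2] nb of initial proteins  -> log column 3
    '1498',                 # l[3] nb of proteins kept     -> log column 4 and nb_final
    {'12': 'SPA_00012'},    # l[4] merged into original_names.pic (contents assumed)
    {'12': ['13', '14']},   # l[5] merged into combined_names.pic (contents assumed)
)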
Example #4
def step2_phylomes(eval, msp, pdia, pfas, tt, pm, nt):

    # convert the parameters to global variables (horrible hack)
    global evalue, max_per_species, path_diamond, path_fasttree, trim_thres, phylo_method, nb_threads
    evalue, max_per_species, path_diamond, path_fasttree, trim_thres, phylo_method, nb_threads = eval, msp, pdia, pfas, tt, pm, nt

    print('\n --- STEP 2: phylomes\n')
    print(' # parameters')
    print(' e_value     : ' + str(evalue))
    print(' nb_hits     : ' + str(max_per_species))
    print(' gaps        : ' + str(trim_thres))
    print(' phylogenies : ' + phylo_method.replace('nj','neighbor joining').replace('me','minimum evolution').replace('ml','maximum likelihood'))
    print(' threads     : ' + str(nb_threads))

    ## create output directory (or empty it if it already exists)
    global out_dir
    out_dir = utils.create_out_dir('dir_step2')
    Path(out_dir / 'dict_trees').mkdir()
    Path(out_dir / 'dict_output').mkdir()
    Path(out_dir / 'dict_similarity_ortho').mkdir()    
    
    ## check directory input data
    global list_files
    print('\n # check input files')
    list_files = pre_checking_data(Path('dir_step1'))
    
    ## load all sequences
    global name_2_sp_phylip_seq, all_species
    name_2_sp_phylip_seq, all_species = create_dict_seq(list_files)

    ## create databases
    global db_dir
    db_dir = out_dir / 'databases'
    db_dir.mkdir(parents=True, exist_ok=True)
    multithread_databases(list_files, nb_threads)
    
    ## process each proteome
    print('\n # build phylomes ... be patient')
    multithread_process_file(list_files, nb_threads)
    
    ## save prot 2 species dict (needed for steps 3 and 4)
    save_prot_2_sp(name_2_sp_phylip_seq)
    
    # delete databases directory 
    shutil.rmtree(db_dir)
    
    print(' done\n')
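
multithread_databases and multithread_process_file are not shown in this excerpt, but every multithreaded step in the other examples follows the same pattern: a pool, map_async with chunksize=1, then close and join. A minimal sketch of that fan-out, assuming a hypothetical per-file worker make_database (a thread pool is used here purely for illustration):

from multiprocessing.dummy import Pool as ThreadPool

def make_database(filename):
    # hypothetical per-file worker; the real helper is not part of this excerpt
    print(' building database for ' + str(filename))

def multithread_databases(list_files, nb_threads):
    # same fan-out pattern as the other multithreaded steps
    pool = ThreadPool(nb_threads)
    pool.map_async(make_database, list_files, chunksize=1)
    pool.close()
    pool.join()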
Example #5
def step4_orthologous_pairs(lo, nsp, nt):

    # convert the parameters to global variables (horrible hack)
    global limit_ortho, not_same_sp, nb_threads
    limit_ortho, not_same_sp, nb_threads = lo, nsp, nt

    print('\n --- STEP 4: orthologous pairs\n')
    print(' ## parameters')
    print(' ratio ortho  : ' + str(limit_ortho))
    print(' not same sp  : ' + str(not_same_sp))
    print(' threads      : ' + str(nb_threads))
    print('\n ## load data')

    ## create output directory (delete it first if it already exists)
    global out_dir
    out_dir = utils.create_out_dir('dir_step4')

    ## check directory
    files_blast_list, files_tree = pre_checking(Path('dir_step2'))

    ## get original and combined names
    original_name = utils.get_pickle(Path('dir_step1') / 'original_names.pic')
    combined_prot = utils.get_pickle(Path('dir_step1') / 'combined_names.pic')

    ## load all data
    global all_no_tree, all_trees
    all_no_tree, all_trees, all_OGs = load_all_data(files_blast_list,
                                                    files_tree)

    global prot_2_sp
    ## load prot_name 2 species dict (string version)
    prot_2_sp = utils.get_pickle(Path('dir_step2') / 'prot_str_2_species.pic')

    ## analyse OGs 1 by 1 and save ortho relationships in file
    print('\n ## analyse ' + str(len(all_OGs)) + ' orthologous groups 1 by 1')
    multithread_process_OG(all_OGs, nb_threads, original_name, combined_prot,
                           not_same_sp)

    print(' done\n')
Example #6
def step3_orthology_network(rov, mw, mnh, lm, nbsp, nt):

    # convert the parameters to global variables (horrible hack)
    global sp_overlap, min_weight, min_nb_hits, chimeric_edges, chimeric_species, nb_threads
    sp_overlap, min_weight, min_nb_hits, chimeric_edges, chimeric_species, nb_threads = rov, mw, mnh, lm, nbsp, nt
    
    print('\n --- STEP 3: network analysis\n')
    print(' ## parameters')
    print(' species overlap  : ' + str(sp_overlap))
    print(' min edge weight  : ' + str(min_weight))
    print(' min nb hits      : ' + str(min_nb_hits))
    print(' chimeric edges   : ' + str(chimeric_edges))
    print(' chimeric species : ' + str(chimeric_species))
    print(' threads          : ' + str(nb_threads))

    ## create output directory (or empty it if it already exists)
    global out_dir
    out_dir = utils.create_out_dir('dir_step3')
        
    ## get all species
    all_species = utils.get_pickle(Path('dir_step1') / 'species_index.pic')
    
    ## check directory
    files_trees = pre_checking(Path('dir_step2'))
    
    ## create log file
    global log_file
    log_file = open(out_dir / 'log_step3.txt', 'w+')
        
    global prot_2_sp
    ## load prot_name 2 species dict (string version)
    prot_2_sp = utils.get_pickle(Path('dir_step2') / 'prot_str_2_species.pic')
    
    print('\n ## get ortho and para')
    ## extract ortho from dir_step2
    extract_ortho(files_trees)
    
    ## extract para from dir_step2
    prot_2_sp = None
    extract_para(files_trees)
    
    ## load prot_name 2 species dict (integer version)
    prot_2_sp = utils.get_pickle(Path('dir_step2') / 'prot_int_2_species.pic')
      
    print('\n ## network analysis')
    ## build network
    global all_nodes, all_edges 
    all_nodes, all_edges = build_network()
    
    ## load all search outputs
    print(' load similarity search outputs')
    global other_hits
    other_hits = load_search_outputs(Path('dir_step2') / 'dict_output', '_output.pic')

    ## define the maximum number of edges to consider for the lcc (= 2 * nb_species, or 10 if there are fewer species) to improve speed
    global limit_degree, limit_nb_max
    limit_degree, limit_nb_max = get_limit_lcc(all_species)

    ## calculate the local clustering coefficient of each node
    global all_lcc
    all_lcc = multithread_lcc(all_nodes, nb_threads)

    ## get all connected_components
    list_cc = utils.get_connected_components(all_edges, all_nodes)
    
    ## analyse each connected_components
    communities, all_chimeric_prot = analyse_cc(list_cc)
    
    ## remove spurious hits
    cleaned_communities = remove_spurious_hits(communities)
    
    ## save OG lists, fusions and stats
    save_outputs(cleaned_communities, all_chimeric_prot, all_species) 
    
    ## clean dir
    (out_dir / 'd_ortho.pic').unlink()
    (out_dir / 'd_para.pic').unlink()
    (out_dir / 's_filter.pic').unlink()
    
    print('')
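
Taken together, the four step functions form a pipeline that is presumably run in order. A sketch of such a driver with purely illustrative parameter values (only the argument order is taken from the signatures above):

if __name__ == '__main__':
    # illustrative values only; argument order follows the signatures shown above
    step1_kmer_clustering('proteomes', 'fasta', 6, 4, 8)           # dir, ext, kmer length, min aa, threads
    step2_phylomes(0.001, 6, 'diamond', 'fasttree', 0.1, 'nj', 8)  # evalue, hits/species, diamond path, fasttree path, gaps, method, threads
    step3_orthology_network(0.5, 2, 2, 100, 3, 8)                  # overlap, min weight, min hits, chimeric edges, chimeric species, threads
    step4_orthologous_pairs(0.5, False, 8)                         # ratio ortho, not same sp, threads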