def RNAclusters_align_makeTree(path, folders_dict, parallel, simple_tree):
    """
    Create RNA clusters as nucleotide fasta files, build one tree per RNA
    cluster from the fna files, and hand the RNA clusters to the gene-cluster
    bookkeeping.

    Steps, in order:
      1. create_RNACluster_fa(path, folders_dict) returns the RNA cluster
         dictionary.
      2. every file matching path + 'geneCluster/*RC*.fna' is processed by
         single_RNACluster_align_and_makeTree through multips.
      3. allclusters_postprocessed.cpk (in path + 'protein_faa/diamond_matches/')
         is copied to a '.bk' file with a shell `cp`, then loaded and passed,
         together with the RNA clusters, to update_gene_cluster_with_RNA.
      4. update_diversity_cpk(path) is called.

    Parameters:
        path: run folder as a string; sub-folder names are appended to it
            directly (presumably it is expected to end with '/').
        folders_dict: forwarded unchanged to create_RNACluster_fa.
        parallel: forwarded to multips (presumably the number of workers).
        simple_tree: forwarded to single_RNACluster_align_and_makeTree
            via multips.
    """
    # NOTE(review): a same-named definition appears again further down this
    # file and rebinds the name when the module is loaded.
    rna_clusters = create_RNACluster_fa(path, folders_dict)

    ## align, build_tree, make_RNATree_json
    rna_fasta_dir = path + 'geneCluster/'
    rna_fasta_files = glob.glob(rna_fasta_dir + "*RC*.fna")
    multips(single_RNACluster_align_and_makeTree, parallel, rna_fasta_files,
            rna_fasta_dir, simple_tree)

    ## add RNA clusters to the gene clusters
    cluster_dir = path + 'protein_faa/diamond_matches/'
    ### back up the cluster file before update_gene_cluster_with_RNA runs;
    ### the exit status of the shell command is not checked
    backup_cmd = (
        'cp %sallclusters_postprocessed.cpk %s/allclusters_postprocessed.cpk.bk '
        % (cluster_dir, cluster_dir))
    os.system(backup_cmd)
    ### load gene clusters and merge the RNA clusters into them
    gene_clusters = load_pickle(cluster_dir + 'allclusters_postprocessed.cpk')
    update_gene_cluster_with_RNA(path, rna_clusters, gene_clusters)
    ### update diversity file
    update_diversity_cpk(path)
def postprocess_paralogs_iterative(parallel,
                                   path,
                                   nstrains,
                                   simple_tree,
                                   paralog_branch_cutoff,
                                   disable_long_branch_splitting,
                                   paralog_frac_cutoff=0.3,
                                   plot=0):

    cluster_path = path + 'protein_faa/diamond_matches/'
    clusters_need_split = 'allclusters_postprocessed.cpk' if not disable_long_branch_splitting else 'allclusters.cpk'
    geneCluster_dt = load_pickle(cluster_path + clusters_need_split)
    ## folder that contains old split clusters in paralog splitting step
    geneClusters_fpath = path + 'geneCluster/'
    os.system('mkdir ' + geneClusters_fpath + 'paralog_splits/')
    if os.path.exists(''.join(
        [geneClusters_fpath, 'old_clusters_paralogSplit.txt'])):
        os.system(''.join(
            ['rm ', geneClusters_fpath, 'old_clusters_paralogSplit.txt']))

    split_result = postprocess_paralogs(
        parallel,
        path,
        nstrains,
        simple_tree,
        geneCluster_dt,
        set(),
        paralog_branch_cutoff=paralog_branch_cutoff,
        paralog_frac_cutoff=paralog_frac_cutoff,
        plot=0)
    n_split_clusters, new_fa_files_set = split_result
    iteration = 0
    while (n_split_clusters):
        print '---- split a total of ', n_split_clusters, 'in iteration', iteration
        split_result = postprocess_paralogs(
            parallel,
            path,
            nstrains,
            simple_tree,
            geneCluster_dt,
            new_fa_files_set,
            paralog_branch_cutoff=paralog_branch_cutoff,
            paralog_frac_cutoff=paralog_frac_cutoff,
            plot=plot)
        n_split_clusters, new_fa_files_set = split_result
        iteration += 1

    ## write gene_diversity_Dt cpk file
    update_diversity_cpk(path)

    ## remove old gene cluster and create new split cluster
    update_geneCluster_cpk(path, geneCluster_dt)

    if os.path.exists(''.join(
        [geneClusters_fpath, 'old_clusters_paralogSplit.txt'])):
        with open(geneClusters_fpath + 'old_clusters_paralogSplit.txt',
                  'r') as delete_cluster_file:
            deleted_file_count = len([clus for clus in delete_cluster_file])
            print '#clusters split during the checking paralogy:', deleted_file_count
def RNAclusters_align_makeTree(path, folders_dict, parallel, simple_tree):
    """
    Create RNA clusters as nucleotide fasta files, build one tree per RNA
    cluster from the fna files, and hand the RNA clusters to the gene-cluster
    bookkeeping.

    Steps, in order:
      1. create_RNACluster_fa(path, folders_dict) returns the RNA cluster
         dictionary.
      2. every file matching path + 'geneCluster/*RC*.fna' is processed by
         single_RNACluster_align_and_makeTree through multips.
      3. allclusters_postprocessed.cpk (in path + 'protein_faa/diamond_matches/')
         is copied to a '.bk' file with a shell `cp`, then loaded and passed,
         together with the RNA clusters, to update_gene_cluster_with_RNA.
      4. update_diversity_cpk(path) is called.

    Parameters:
        path: run folder as a string; sub-folder names are appended to it
            directly (presumably it is expected to end with '/').
        folders_dict: forwarded unchanged to create_RNACluster_fa.
        parallel: forwarded to multips (presumably the number of workers).
        simple_tree: forwarded to single_RNACluster_align_and_makeTree
            via multips.
    """
    # NOTE(review): a same-named definition appears earlier in this file;
    # this one rebinds the name when the module is loaded.
    rna_clusters = create_RNACluster_fa(path, folders_dict)

    ## align, build_tree, make_RNATree_json
    rna_fasta_dir = path + 'geneCluster/'
    rna_fasta_files = glob.glob(rna_fasta_dir + "*RC*.fna")
    multips(single_RNACluster_align_and_makeTree, parallel, rna_fasta_files,
            rna_fasta_dir, simple_tree)

    ## add RNA clusters to the gene clusters
    cluster_dir = path + 'protein_faa/diamond_matches/'
    ### back up the cluster file before update_gene_cluster_with_RNA runs;
    ### the exit status of the shell command is not checked
    backup_cmd = (
        'cp %sallclusters_postprocessed.cpk %s/allclusters_postprocessed.cpk.bk '
        % (cluster_dir, cluster_dir))
    os.system(backup_cmd)
    ### load gene clusters and merge the RNA clusters into them
    gene_clusters = load_pickle(cluster_dir + 'allclusters_postprocessed.cpk')
    update_gene_cluster_with_RNA(path, rna_clusters, gene_clusters)
    ### update diversity file
    update_diversity_cpk(path)
def postprocess_paralogs_iterative(parallel, path, nstrains, simple_tree,
   	paralog_branch_cutoff, disable_long_branch_splitting, paralog_frac_cutoff=0.3, plot=0):

    cluster_path= path+'protein_faa/diamond_matches/'
    clusters_need_split='allclusters_postprocessed.cpk' if not disable_long_branch_splitting else 'allclusters.cpk'
    geneCluster_dt=load_pickle(cluster_path+clusters_need_split)
    ## folder that contains old split clusters in paralog splitting step
    geneClusters_fpath=path+'geneCluster/'
    os.system('mkdir '+geneClusters_fpath+'paralog_splits/')
    if os.path.exists(''.join([geneClusters_fpath,'old_clusters_paralogSplit.txt'])):
        os.system(''.join(['rm ',geneClusters_fpath,'old_clusters_paralogSplit.txt']))

    split_result= postprocess_paralogs( parallel, path, nstrains, simple_tree,
                                            geneCluster_dt, set(),
                                            paralog_branch_cutoff=paralog_branch_cutoff,
                                            paralog_frac_cutoff=paralog_frac_cutoff, plot=0)
    n_split_clusters, new_fa_files_set = split_result
    iteration=0
    while(n_split_clusters):
        print '---- split a total of ',n_split_clusters, 'in iteration', iteration
        split_result= postprocess_paralogs( parallel, path, nstrains, simple_tree,
                                                geneCluster_dt, new_fa_files_set,
                                                paralog_branch_cutoff=paralog_branch_cutoff,
                                                paralog_frac_cutoff=paralog_frac_cutoff, plot=plot)
        n_split_clusters, new_fa_files_set = split_result
        iteration+=1

    ## write gene_diversity_Dt cpk file
    update_diversity_cpk(path)

    ## remove old gene cluster and create new split cluster
    update_geneCluster_cpk(path, geneCluster_dt)

    if os.path.exists(''.join([geneClusters_fpath,'old_clusters_paralogSplit.txt'])):
        with open(geneClusters_fpath+'old_clusters_paralogSplit.txt', 'r') as delete_cluster_file:
            deleted_file_count=len([ clus for clus in delete_cluster_file ])
            print '#clusters split during the checking paralogy:',deleted_file_count
def postprocess_split_long_branch(parallel,
                                  path,
                                  simple_tree,
                                  cut_branch_threshold=0.3):
    """
    Split tree via breaking up long branches.
    Remote homology leads to over-clustering. This yields tree with long branches.
    """

    file_path = ''.join([path, 'geneCluster/'])
    new_split_folder = ''.join([file_path, 'update_long_branch_splits/'])
    if os.path.exists(new_split_folder):
        ## remove the folder from previous run
        os.system(''.join(['rm -r ', new_split_folder]))
    os.system(''.join(['mkdir ', new_split_folder]))
    deleted_clusters_folder = ''.join(
        [file_path, 'deleted_clusters_longSplit/'])
    if os.path.exists(deleted_clusters_folder):
        os.system(''.join(['rm -r ', deleted_clusters_folder]))
    os.system(''.join(['mkdir ', deleted_clusters_folder]))

    ## load clusters
    cluster_path = '%s%s' % (path, 'protein_faa/diamond_matches/')
    geneCluster_dt = load_pickle(cluster_path + 'allclusters.cpk')

    ## gather all trees generated before postprocessing
    tree_path = file_path
    tree_fname_list = glob.glob(tree_path + '*nwk')

    ## ensure that writing to new_clusters_longSplit starts at the beginning (for re-running)
    if os.path.exists(''.join([file_path, 'new_clusters_longSplit.txt'])):
        os.system(''.join(['rm ', file_path, 'new_clusters_longSplit.txt']))
    if os.path.exists(''.join([file_path, 'old_clusters_longSplit.txt'])):
        os.system(''.join(['rm ', file_path, 'old_clusters_longSplit.txt']))

    # =============================================
    # parallelization:
    # "post-clustering workflow for splitting trees on over-clustered records"
    treefile_used = True
    multips(cutTree_outputCluster, parallel, tree_fname_list, file_path,
            cut_branch_threshold, treefile_used)

    ## If new_clusters_longSplit.txt (over_split records) exists,
    ## then gather new clusters from new_clusters_longSplit.txt
    if os.path.exists(''.join([file_path, 'new_clusters_longSplit.txt'])):
        with open(file_path + 'new_clusters_longSplit.txt',
                  'rb') as new_clusters_longSplit:
            new_fa_files_list = [
                clus.rstrip() for clus in new_clusters_longSplit
            ]
            print '#times of splitting long branches:', len(
                new_fa_files_list) - 1
        with open(file_path + 'old_clusters_longSplit.txt',
                  'rb') as delete_cluster_file:
            deleted_file_count = len([clus for clus in delete_cluster_file])
            print '#clusters split during the checking of long branches:', deleted_file_count

        ## parallelization of "align and make tree on new cluster"
        multips(align_and_makeTree, parallel, new_fa_files_list, file_path,
                simple_tree)
        # =============================================

        ## delete original clusters which are split
        delete_original_clusters(file_path, geneCluster_dt)
        ## add newly split clusters
        update_geneCluster_dt(path, geneCluster_dt)
        ## write updated gene clusters in cpk file
        update_geneCluster_cpk(path, geneCluster_dt)
        ## write gene_diversity_Dt cpk file
        update_diversity_cpk(path)

        os.system(' '.join([
            'mv ', file_path + 'new_clusters_longSplit.txt',
            file_path + 'added_clusters_split_long.txt'
        ]))
        os.system(' '.join([
            'mv ', file_path + 'old_clusters_longSplit.txt',
            file_path + 'deleted_clusters_split_long.txt'
        ]))
    else:  # no clusters postprocessed
        os.system(' '.join([
            'cp', cluster_path + 'allclusters.cpk',
            cluster_path + 'allclusters_postprocessed.cpk'
        ]))
def postprocess_split_long_branch(parallel, path, simple_tree, cut_branch_threshold=0.3):
    """
    Split tree via breaking up long branches.
    Remote homology leads to over-clustering. This yields tree with long branches.
    """

    file_path = ''.join([path,'geneCluster/'])
    new_split_folder= ''.join([file_path,'update_long_branch_splits/'])
    if os.path.exists(new_split_folder):
        ## remove the folder from previous run
        os.system(''.join(['rm -r ',new_split_folder]))
    os.system(''.join(['mkdir ',new_split_folder]))
    deleted_clusters_folder=''.join([file_path,'deleted_clusters_longSplit/'])
    if os.path.exists(deleted_clusters_folder):
        os.system(''.join(['rm -r ',deleted_clusters_folder]))
    os.system(''.join(['mkdir ',deleted_clusters_folder]))

    ## load clusters
    cluster_path='%s%s'%(path,'protein_faa/diamond_matches/')
    geneCluster_dt=load_pickle(cluster_path+'allclusters.cpk')

    ## gather all trees generated before postprocessing
    tree_path = file_path
    tree_fname_list =glob.glob(tree_path+'*nwk')

    ## ensure that writing to new_clusters_longSplit starts at the beginning (for re-running)
    if os.path.exists(''.join([file_path,'new_clusters_longSplit.txt'])):
        os.system(''.join(['rm ',file_path,'new_clusters_longSplit.txt']))
    if os.path.exists(''.join([file_path,'old_clusters_longSplit.txt'])):
        os.system(''.join(['rm ',file_path,'old_clusters_longSplit.txt']))

    # =============================================
    # parallelization:
    # "post-clustering workflow for splitting trees on over-clustered records"
    treefile_used=True
    multips(cutTree_outputCluster, parallel, tree_fname_list, file_path, cut_branch_threshold, treefile_used)

    ## If new_clusters_longSplit.txt (over_split records) exists,
    ## then gather new clusters from new_clusters_longSplit.txt
    if os.path.exists(''.join([file_path,'new_clusters_longSplit.txt'])):
        with open(file_path+'new_clusters_longSplit.txt', 'rb') as new_clusters_longSplit:
            new_fa_files_list=[ clus.rstrip() for clus in new_clusters_longSplit ]
            print '#times of splitting long branches:',len(new_fa_files_list)-1
        with open(file_path+'old_clusters_longSplit.txt', 'rb') as delete_cluster_file:
            deleted_file_count=len([ clus for clus in delete_cluster_file ])
            print '#clusters split during the checking of long branches:',deleted_file_count

        ## parallelization of "align and make tree on new cluster"
        multips(align_and_makeTree, parallel, new_fa_files_list, file_path, simple_tree)
        # =============================================

        ## delete original clusters which are split
        delete_original_clusters(file_path, geneCluster_dt)
        ## add newly split clusters
        update_geneCluster_dt(path,geneCluster_dt)
        ## write updated gene clusters in cpk file
        update_geneCluster_cpk(path, geneCluster_dt)
        ## write gene_diversity_Dt cpk file
        update_diversity_cpk(path)

        os.system(' '.join(['mv ',file_path+'new_clusters_longSplit.txt' ,file_path+'added_clusters_split_long.txt' ]))
        os.system(' '.join(['mv ',file_path+'old_clusters_longSplit.txt', file_path+'deleted_clusters_split_long.txt']))
    else: # no clusters postprocessed
        os.system(' '.join(['cp',cluster_path+'allclusters.cpk',cluster_path+'allclusters_postprocessed.cpk']))
def postprocess_unclustered_genes(parallel,
                                  path,
                                  nstrains,
                                  simple_tree,
                                  split_long_branch_cutoff,
                                  window_size_smoothed=5,
                                  strain_proportion=0.3,
                                  sigma_scale=3):
    """
    Merge suspected unclustered genes and re-split the merged clusters.

    Algorithm outline (original author's notes; the individual steps are
    implemented by the helpers called here, not in this function):
        1) detect suspicious peaks in the distribution of average length of genes in gene cluster (aa count)
           np.bincount([1,2,3,34,3]) -> how often each entry is found
           np.convolve([1,1,1,1,1], gene_length_count)
          ->  unclustered genes will contribute many small clusters (size 1)
              that result in peaks in the distribution
        2) for each peak detected, align the sequences of all genes in clusters in peak
        3) to cluster aligned genes, build tree. However, to ensure long branches
          ->  between unaligned sub-alignment, fill gaps with random sequence (skipped, not tested)
              importantly, this random sequence needs to be the same in different columns of the alignment.
                  - random sequence a la rseq = [alpha[ii] for ii in np.random.randint(len(alpha), size=aln.get_aligmentlength())]
                  - for seq in aln: seq[seq=='-'] = rseq[seq=='-']
        4) make and split tree at long branches; the threshold handed on is
           split_long_branch_cutoff (the original notes mention 0.5)
        5) for each subtree (ideally only one big tree), define new gene cluster and run
           maketree_align from standard step 6

    What this function itself does:
      * removes (if present) and recreates update_long_branch_splits/ and
        deleted_clusters_peaks_splits/ under path + 'geneCluster/';
      * loads allclusters_postprocessed.cpk from
        path + 'protein_faa/diamond_matches/';
      * calls find_and_merge_unclustered_genes; if its result is empty,
        nothing else happens;
      * otherwise removes stale new/old_clusters_longSplit.txt, calls
        cut_all_trees_from_merged_clusters, delete_old_merged_clusters,
        update_geneCluster_dt, update_geneCluster_cpk and
        update_diversity_cpk, in that order.

    Parameters:
        parallel: forwarded to cut_all_trees_from_merged_clusters.
        path: run folder as a string; sub-folder names are appended directly.
        nstrains, window_size_smoothed, strain_proportion, sigma_scale:
            forwarded to find_and_merge_unclustered_genes.
        simple_tree: forwarded to cut_all_trees_from_merged_clusters.
        split_long_branch_cutoff: branch-length threshold forwarded to
            cut_all_trees_from_merged_clusters.
    """
    # NOTE(review): a same-named definition appears again further down this
    # file and rebinds the name when the module is loaded.
    geneCluster_fasta_path = ''.join([path, 'geneCluster/'])

    ## recreate the working folders, discarding what a previous run left
    new_split_folder = ''.join(
        [geneCluster_fasta_path, 'update_long_branch_splits/'])
    deleted_clusters_folder = ''.join(
        [geneCluster_fasta_path, 'deleted_clusters_peaks_splits/'])
    for work_folder in (new_split_folder, deleted_clusters_folder):
        if os.path.exists(work_folder):
            os.system(''.join(['rm -r ', work_folder]))
        os.system(''.join(['mkdir ', work_folder]))

    ## load clusters
    ClusterPath = '%s%s' % (path, 'protein_faa/diamond_matches/')
    geneCluster_dt = load_pickle(ClusterPath + 'allclusters_postprocessed.cpk')

    ## merge unclustered genes
    ## (the result is used directly; it is not pre-initialised, because any
    ## placeholder would be overwritten by this call anyway)
    merged_clusters_dict = find_and_merge_unclustered_genes(
        path, nstrains, window_size_smoothed, strain_proportion, sigma_scale)

    if len(merged_clusters_dict) == 0:
        ## no merged clusters for any peak: nothing to cut or update
        return

    ## there are merged clusters corresponding to the cluster peaks

    ## ensure that writing to new_clusters_longSplit starts at the beginning (for re-running)
    for list_name in ('new_clusters_longSplit.txt',
                      'old_clusters_longSplit.txt'):
        if os.path.exists(''.join([geneCluster_fasta_path, list_name])):
            os.system(''.join(['rm ', geneCluster_fasta_path, list_name]))

    cut_branch_threshold = split_long_branch_cutoff
    ## cut tree and make new clusters
    cut_all_trees_from_merged_clusters(parallel, path,
                                       cut_branch_threshold, simple_tree)

    ## delete old clusters
    delete_old_merged_clusters(geneCluster_fasta_path, geneCluster_dt,
                               merged_clusters_dict)
    ## add newly split clusters
    update_geneCluster_dt(path, geneCluster_dt)
    ## write updated gene clusters in cpk file
    update_geneCluster_cpk(path, geneCluster_dt)
    ## write gene_diversity_Dt cpk file
    update_diversity_cpk(path)
def postprocess_unclustered_genes(parallel, path, nstrains, simple_tree, split_long_branch_cutoff,
    window_size_smoothed=5, strain_proportion=0.3, sigma_scale=3):
    """
    Merge suspected unclustered genes and re-split the merged clusters.

    Algorithm outline (original author's notes; the individual steps are
    implemented by the helpers called here, not in this function):
        1) detect suspicious peaks in the distribution of average length of genes in gene cluster (aa count)
           np.bincount([1,2,3,34,3]) -> how often each entry is found
           np.convolve([1,1,1,1,1], gene_length_count)
          ->  unclustered genes will contribute many small clusters (size 1)
              that result in peaks in the distribution
        2) for each peak detected, align the sequences of all genes in clusters in peak
        3) to cluster aligned genes, build tree. However, to ensure long branches
          ->  between unaligned sub-alignment, fill gaps with random sequence (skipped, not tested)
              importantly, this random sequence needs to be the same in different columns of the alignment.
                  - random sequence a la rseq = [alpha[ii] for ii in np.random.randint(len(alpha), size=aln.get_aligmentlength())]
                  - for seq in aln: seq[seq=='-'] = rseq[seq=='-']
        4) make and split tree at long branches; the threshold handed on is
           split_long_branch_cutoff (the original notes mention 0.5)
        5) for each subtree (ideally only one big tree), define new gene cluster and run
           maketree_align from standard step 6

    What this function itself does:
      * removes (if present) and recreates update_long_branch_splits/ and
        deleted_clusters_peaks_splits/ under path + 'geneCluster/';
      * loads allclusters_postprocessed.cpk from
        path + 'protein_faa/diamond_matches/';
      * calls find_and_merge_unclustered_genes; if its result is empty,
        nothing else happens;
      * otherwise removes stale new/old_clusters_longSplit.txt, calls
        cut_all_trees_from_merged_clusters, delete_old_merged_clusters,
        update_geneCluster_dt, update_geneCluster_cpk and
        update_diversity_cpk, in that order.

    Parameters:
        parallel: forwarded to cut_all_trees_from_merged_clusters.
        path: run folder as a string; sub-folder names are appended directly.
        nstrains, window_size_smoothed, strain_proportion, sigma_scale:
            forwarded to find_and_merge_unclustered_genes.
        simple_tree: forwarded to cut_all_trees_from_merged_clusters.
        split_long_branch_cutoff: branch-length threshold forwarded to
            cut_all_trees_from_merged_clusters.
    """
    # NOTE(review): a same-named definition appears earlier in this file;
    # this one rebinds the name when the module is loaded.
    geneCluster_fasta_path = ''.join([path, 'geneCluster/'])

    ## recreate the working folders, discarding what a previous run left
    new_split_folder = ''.join(
        [geneCluster_fasta_path, 'update_long_branch_splits/'])
    deleted_clusters_folder = ''.join(
        [geneCluster_fasta_path, 'deleted_clusters_peaks_splits/'])
    for work_folder in (new_split_folder, deleted_clusters_folder):
        if os.path.exists(work_folder):
            os.system(''.join(['rm -r ', work_folder]))
        os.system(''.join(['mkdir ', work_folder]))

    ## load clusters
    ClusterPath = '%s%s' % (path, 'protein_faa/diamond_matches/')
    geneCluster_dt = load_pickle(ClusterPath + 'allclusters_postprocessed.cpk')

    ## merge unclustered genes
    ## (the result is used directly; it is not pre-initialised, because any
    ## placeholder would be overwritten by this call anyway)
    merged_clusters_dict = find_and_merge_unclustered_genes(
        path, nstrains, window_size_smoothed, strain_proportion, sigma_scale)

    if len(merged_clusters_dict) == 0:
        ## no merged clusters for any peak: nothing to cut or update
        return

    ## there are merged clusters corresponding to the cluster peaks

    ## ensure that writing to new_clusters_longSplit starts at the beginning (for re-running)
    for list_name in ('new_clusters_longSplit.txt',
                      'old_clusters_longSplit.txt'):
        if os.path.exists(''.join([geneCluster_fasta_path, list_name])):
            os.system(''.join(['rm ', geneCluster_fasta_path, list_name]))

    cut_branch_threshold = split_long_branch_cutoff
    ## cut tree and make new clusters
    cut_all_trees_from_merged_clusters(parallel, path,
                                       cut_branch_threshold, simple_tree)

    ## delete old clusters
    delete_old_merged_clusters(geneCluster_fasta_path, geneCluster_dt,
                               merged_clusters_dict)
    ## add newly split clusters
    update_geneCluster_dt(path, geneCluster_dt)
    ## write updated gene clusters in cpk file
    update_geneCluster_cpk(path, geneCluster_dt)
    ## write gene_diversity_Dt cpk file
    update_diversity_cpk(path)