Exemplo n.º 1
0
def get_temp_dirs(ali_filename, OutDirName):
    """Create the per-run working directories and return their paths.

    Args:
        ali_filename: path to the input alignment file.
        OutDirName: run output directory; assumed to end with "/" --
                    note the separator-less concatenation below.

    Returns:
        Tuple (repbppconfig, repseq, ali_basename, repest0, reptree0,
        repfasta0): the bpp config dir, the alignment's directory, the
        alignment's basename, and the Estimations/Trees/fasta dirs.
    """
    # NOTE(review): no "/" before "bpp_config" -- relies on OutDirName
    # ending with a slash; confirm against the caller.
    repbppconfig = OutDirName + "bpp_config"
    # makedirs(exist_ok=True) avoids the check-then-create race of
    # os.path.exists + os.mkdir and tolerates concurrent runs.
    os.makedirs(repbppconfig, exist_ok=True)

    # Write the Bio++ estimation configuration files into the config dir.
    bpp_lib.write_config(repbppconfig, estim=True)

    repseq = os.path.dirname(ali_filename)
    if not repseq:
        # alignment given as a bare filename -> its directory is the cwd
        repseq = "."

    # logger.info("alignment directory: %s", repseq)

    ali_basename = os.path.basename(ali_filename)
    repest0 = OutDirName + "/Estimations"
    reptree0 = OutDirName + "/Trees"
    repfasta0 = OutDirName + "/fasta"

    for d in (repest0, reptree0, repfasta0):
        os.makedirs(d, exist_ok=True)

    return repbppconfig, repseq, ali_basename, repest0, reptree0, repfasta0
Exemplo n.º 2
0
#    cpus = 1   # arbitrary default
#logger.info("%s on %s cpus", cpu, cpus)

# Optionally override the dynamic linker search path (e.g. so the bpp
# binaries resolve against a specific libbpp build).
if args.LD_LIB:
    logger.info("$LD_LIBRARY_PATH will be change from %s to %s",
                os.environ.get("LD_LIBRARY_PATH", ""), args.LD_LIB)
    os.environ["LD_LIBRARY_PATH"] = args.LD_LIB
else:
    logger.debug("$LD_LIBRARY_PATH is %s",
                 os.environ.get("LD_LIBRARY_PATH", ""))

# Working directory for the generated Bio++ (bpp) configuration files.
# NOTE(review): no "/" before "bpp_config" -- assumes OutDirName ends
# with a slash; confirm where OutDirName is built (outside this chunk).
repbppconfig = OutDirName + "bpp_config"
if not os.path.exists(repbppconfig):
    os.mkdir(repbppconfig)

bpp_lib.write_config(repbppconfig, estim=True)

# Regex matching decimal numbers; used later to detect branch lengths
# in a newick string.
#http://stackoverflow.com/questions/23172293/use-python-to-extract-branch-lengths-from-newick-format
pattern = re.compile(r"\b[0-9]+(?:\.[0-9]+)?\b")

logger.info("alignment: %s", args.ali)
ali_filename = args.ali

# Accumulates the number of sites per processed alignment.
l_n_sites = []

if os.path.isfile(ali_filename):
    # Biopython AlignIO: parse the alignment as FASTA.
    ali = AlignIO.read(ali_filename, "fasta")
    #check alphabet
    alphabet = {}
    error = {}
    n_tot = 0
    # NOTE(review): fragment truncated here -- the remainder of this
    # `if` body lies outside the visible chunk.
Exemplo n.º 3
0
    cpus = 1   # arbitrary default
# NOTE(review): the indented line above is the tail of a truncated
# try/except (fallback when multiprocessing.cpu_count() raises
# NotImplementedError) -- its opening lines are outside this chunk.
logger.info("%s on %s cpus", cpu, cpus)

# Optionally override the dynamic linker search path (e.g. so the bpp
# binaries resolve against a specific libbpp build).
if args.LD_LIB:
    logger.info("$LD_LIBRARY_PATH will be change from %s to %s", os.environ.get("LD_LIBRARY_PATH", ""), args.LD_LIB)
    os.environ["LD_LIBRARY_PATH"]=args.LD_LIB
else:
    logger.debug("$LD_LIBRARY_PATH is %s", os.environ.get("LD_LIBRARY_PATH", ""))


# Working directory for the generated Bio++ (bpp) configuration files.
# NOTE(review): no "/" before "bpp_config" -- assumes OutDirName ends
# with a slash; confirm where OutDirName is built (outside this chunk).
repbppconfig = OutDirName +  "bpp_config"
if not os.path.exists(repbppconfig):
    os.mkdir(repbppconfig)


# Write the bpp config files, then load the pairwise distances between
# the NbCat CAT profile categories that write_config produced.
bpp_lib.write_config(repbppconfig, estim=True, NbCat = args.CATX_sim)
dist_C1_C2 =  pd.read_csv(repbppconfig+'/CATC'+str(args.CATX_sim)+'Distances.csv', index_col=0)

#MODIFIED 20190301 JRW
# if a tree *directory* is passed
if os.path.isdir(args.tree_dir):
    # get all the files in the dir
    lnf = glob.glob(args.tree_dir+"/*")
# if a single file is passed
else:
    # make it a list of 1 filename
    lnf = [args.tree_dir,]

flg=args.flg
if flg == 1:
    logger.info("Branch length multiplicator:\tno")
    # NOTE(review): fragment truncated here -- the rest of this
    # if/else lies outside the visible chunk.
Exemplo n.º 4
0
def main(args):
    """Run the PCOC simulation pipeline.

    Creates a timestamped output directory, configures file/console
    logging, validates the input newick trees, then simulates
    ``args.n_sc`` convergence scenarios per tree (multiprocessed when
    several CPUs are available) and writes the per-scenario, per-tree
    and per-run metadata and benchmark tables.

    Args:
        args: argparse namespace carrying all command-line options
              (output_dir, cpu, LD_LIB, tree_dir, CATX_sim, ...).
    """
    metadata_run_dico = {}
    date = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    OutDirName = "%s/RUN_%s/" % (args.output_dir, date)
    OutDirName = OutDirName.replace("//", "/")

    metadata_run_dico["RunID"] = date

    ### Set up the output directory
    if os.path.isdir(OutDirName):
        pass
        #logger.info("The output directory %s exists", OutDirName)
    elif OutDirName: # if OutDirName is not a empty string we create the directory
        #logger.info("The output directory %s does not exist, it will be created", OutDirName)
        os.makedirs(OutDirName)

    ### Set up the log file
    LogFile = OutDirName + "/pcoc_sim.log"

    ### Set up the logger
    # create file handler which logs even debug messages
    fh = logging.FileHandler(LogFile)
    # create console handler with a higher log level
    ch = logging.StreamHandler()
    if args.debug:
        ch.setLevel(logging.DEBUG)
        fh.setLevel(logging.DEBUG)
    else:
        ch.setLevel(logging.INFO)
        fh.setLevel(logging.INFO)
    # create formatter and add it to the handlers
    formatter_fh = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    formatter_ch = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    fh.setFormatter(formatter_fh)
    ch.setFormatter(formatter_ch)
    # add the handlers to the logger
    logger.addHandler(fh)
    logger.addHandler(ch)

    logger.debug(sys.argv)

    cpu = args.cpu
    try:
        cpus = multiprocessing.cpu_count()
    except NotImplementedError:
        cpus = 1   # arbitrary default
    logger.info("%s on %s cpus", cpu, cpus)

    # Optionally override the dynamic linker search path (e.g. so the bpp
    # binaries resolve against a specific libbpp build).
    if args.LD_LIB:
        logger.info("$LD_LIBRARY_PATH will be change from %s to %s", os.environ.get("LD_LIBRARY_PATH", ""), args.LD_LIB)
        os.environ["LD_LIBRARY_PATH"] = args.LD_LIB
    else:
        logger.debug("$LD_LIBRARY_PATH is %s", os.environ.get("LD_LIBRARY_PATH", ""))

    # Working directory for the generated Bio++ (bpp) configuration files.
    # OutDirName always ends with "/" (built above), hence no separator.
    repbppconfig = OutDirName + "bpp_config"
    if not os.path.exists(repbppconfig):
        os.mkdir(repbppconfig)

    # Write the bpp config files, then load the pairwise distances between
    # the NbCat CAT profile categories that write_config produced.
    bpp_lib.write_config(repbppconfig, estim=True, NbCat=args.CATX_sim)
    dist_C1_C2 = pd.read_csv(repbppconfig + '/CATC' + str(args.CATX_sim) + 'Distances.csv', index_col=0)

    lnf = glob.glob(args.tree_dir + "/*")

    flg = args.flg
    if flg == 1:
        logger.info("Branch length multiplicator:\tno")
        metadata_run_dico["Branch length multiplicator"] = "no"
    else:
        logger.info("Branch length multiplicator:\t%s", flg)
        metadata_run_dico["Branch length multiplicator"] = flg

    bl_new = args.bl_new
    if bl_new > 0:
        logger.info("Branch length remplacement:\t%s", bl_new)
        metadata_run_dico["Branch length remplacement"] = bl_new
    else:
        bl_new = -1
        metadata_run_dico["Branch length remplacement"] = "no"

    # Regex matching decimal numbers; used to detect branch lengths in a
    # newick string.
    #http://stackoverflow.com/questions/23172293/use-python-to-extract-branch-lengths-from-newick-format
    pattern = re.compile(r"\b[0-9]+(?:\.[0-9]+)?\b")

    logger.debug("trees:\n  * %s", "\n  * ".join(lnf))
    nb_input_tree_before = len(lnf)
    logger.debug("%s trees in %s", nb_input_tree_before, args.tree_dir)

    # Validate the candidate tree files, dropping unusable ones.
    # BUGFIX: iterate over a *copy* of lnf -- removing items from the list
    # being iterated silently skips the element after each removal.
    for treefilename in list(lnf):
        # test if a tree
        try:
            t = Tree(treefilename)
        except Exception:  # not parseable as newick (was a bare except)
            logger.warning("%s is not a newick tree, this tree will not be used", treefilename)
            lnf.remove(treefilename)
            t = ""
        if t:
            # BUGFIX: use a context manager so the file handle is always
            # closed, even if read() raises.
            with open(treefilename, "r") as treefile:
                tree = treefile.read().strip()
            #test if branch length
            branch_lengths = pattern.findall(tree)
            if branch_lengths == []:
                logger.warning("No branch length in %s, this tree will not be used", treefilename)
                lnf.remove(treefilename)

    nb_input_tree_after = len(lnf)
    if nb_input_tree_after != nb_input_tree_before:
        logger.warning("%s trees in %s after checking (%s before)", nb_input_tree_after, args.tree_dir, nb_input_tree_before)

    logger.debug("trees:\n  * %s", "\n  * ".join(lnf))

    logger.info("Number of input trees:\t%s", nb_input_tree_after)
    metadata_run_dico["Number of input trees"] = nb_input_tree_after

    if len(lnf) == 0:
        logger.error("No tree. Bye.")
        sys.exit(1)

    if len(lnf) > 1 and args.manual_mode:
        logger.error("Only 1 tree if manual mode.")
        sys.exit(1)

    metadata_run_dico["Input trees"] = ",".join([os.path.basename(t) for t in lnf])

    Nbsimul = args.n_sc
    maxTrans = args.c_max
    minTrans = args.c_min
    maxConvRate = args.cr
    Nsites = args.n_sites

    # Optional noise on the placement of convergent events ("+1"/"-1"/"=1").
    ev_noise = args.ev_noise
    if ev_noise:
        logger.info("Add event placement noise: %s", ev_noise)
        if not args.c:
            logger.error("-c must be also used if you use --ev_noise")
            sys.exit(1)
        if ev_noise == "+1":
            if maxTrans < (args.c + 1):
                logger.error("maxTrans must be superior to -c if you use --ev_noise +1 ")
                sys.exit(1)
            pass
        elif ev_noise == "-1":
            if minTrans >= (args.c - 1):
                logger.error("minTrans must be inferior or equal to -c -1 if you use --ev_noise -1 ")
                sys.exit(1)
            pass
        elif ev_noise == "=1":
            pass

    root_noise = args.root_noise
    if root_noise:
        logger.info("Add root noise: %s", root_noise)

    metadata_run_dico["AliNoise"]  = "No"
    metadata_run_dico["BlNoise"]   = "No"
    metadata_run_dico["EvNoise"]   = "No"
    metadata_run_dico["RootNoise"] = "No"

    if args.ali_noise:
        metadata_run_dico["AliNoise"] = "Yes"
    if args.bl_noise:
        metadata_run_dico["BlNoise"] = "Yes"
    if ev_noise:
        metadata_run_dico["EvNoise"] = ev_noise
    if root_noise:
        metadata_run_dico["RootNoise"] = root_noise

    # Manual mode: scenario given on the command line as
    # "T,C,C,.../T,C,C,..." -- one transition node T followed by its
    # convergent nodes C for each event.
    manual_mode_nodes = {}
    if args.manual_mode:
        manual_mode_nodes = {"T": [], "C": []}
        p_events = args.manual_mode.strip().split("/")
        for e in p_events:
            # BUGFIX: list(...) -- map() returns a lazy iterator on
            # Python 3, which cannot be indexed/sliced below.
            l_e = list(map(int, e.split(",")))
            manual_mode_nodes["T"].append(l_e[0])
            manual_mode_nodes["C"].extend(l_e[1:])
        if args.c_min > len(manual_mode_nodes["T"]):
            minTrans = len(manual_mode_nodes["T"])
        if args.c_max != len(manual_mode_nodes["T"]):
            maxTrans = len(manual_mode_nodes["T"])

    metadata_run_dico["Number of scenarios per input tree"] = Nbsimul
    metadata_run_dico["Maximum number of convergent events"] = maxTrans
    metadata_run_dico["Minimum number of convergent events"] = minTrans
    metadata_run_dico["Maximum rate of the number of Convergent/Non-convergent leaves"] = maxConvRate
    metadata_run_dico["Number of simulated sites"] = Nsites

    logger.info("Number of scenarios per input tree (= 1 tree and 1 set of convergent events):\t%s", Nbsimul)
    logger.info("Number of simulated sites:\t%s", Nsites)
    logger.info("Maximum number of convergent events:\t%s", maxTrans)
    logger.info("Minimum number of convergent events:\t%s", minTrans)
    logger.info("Maximum rate of the number of Convergent/Non-convergent leaves:\t%s", maxConvRate)

    NbCat_Sim = args.CATX_sim
    NbCat_Est = args.CATX_est

    MinDistCAT = args.min_dist_CAT
    Nb_sampled_couple = args.nb_sampled_couple

    metadata_run_dico["Profile categories use during simulation"] = NbCat_Sim
    metadata_run_dico["Profile categories use during estimation"] = NbCat_Est
    metadata_run_dico["Number of sampled profile couples per scenario"] = Nb_sampled_couple
    metadata_run_dico["Minimum distance between 2 profiles of a couple to be use for simulations"] = MinDistCAT

    pcoc_str = "No"
    topo_str = "No"
    ident_str = "No"
    if args.pcoc:
        pcoc_str = "Yes"
    if args.topo:
        topo_str = "Yes"
    if args.ident:
        ident_str = "Yes"
    logger.info("Run PCOC method:\t%s", pcoc_str)
    logger.info("Run topological method:\t%s", topo_str)
    logger.info("Run identical method:\t%s", ident_str)
    metadata_run_dico["Run PCOC method"] = pcoc_str
    metadata_run_dico["Run topological method"] = topo_str
    metadata_run_dico["Run identical method"] = ident_str

    pd.Series(metadata_run_dico).to_csv(OutDirName + "/run_metadata.tsv", sep='\t')

    ### Choose CAT profiles
    # Build the list of (C1, C2) profile couples eligible for simulation:
    # either all couples further apart than MinDistCAT, or every ordered
    # couple with C1 != C2 when no minimum distance is required.
    CATcouples = []
    if MinDistCAT > 0:
        # BUGFIX: the csv module needs a *text*-mode file opened with
        # newline='' on Python 3; 'rb' was a Python-2 idiom.
        with open(repbppconfig + '/CATC' + str(NbCat_Sim) + 'Distances.csv', newline='') as csvfile:
            reader = csv.reader(csvfile, delimiter=',', quotechar='"')
            first_line = True
            for row in reader:
                #print row
                if first_line:
                    first_line = False
                    colnames = row
                    colnames.pop(0)
                else:
                    C1 = int(row.pop(0).replace("C", ""))
                    for C2 in colnames:
                        C2 = int(C2.replace("C", ""))
                        d = float(row.pop(0))
                        if d >= MinDistCAT:
                            CATcouples.append((C1, C2))
                        else:
                            #print("rejecte %s %s" %(C1,C2))
                            pass
    else:
        for C1 in range(1, NbCat_Sim + 1):
            for C2 in range(1, NbCat_Sim + 1):
                if C1 != C2:
                    CATcouples.append((C1, C2))

    # Cap the number of sampled couples at what is actually available.
    if not Nb_sampled_couple:
        Nb_sampled_couple = len(CATcouples)
    elif Nb_sampled_couple > len(CATcouples):
        Nb_sampled_couple = len(CATcouples)

    logger.info("Profile category uses during simulation:\t%s", NbCat_Sim)
    logger.info("Profile category uses during estimation:\t%s", NbCat_Est)
    logger.info("Minimum distance between 2 profiles of a couple to be use for simulations:\t%s", MinDistCAT)
    logger.info("Number of sampled profile couples per scenario:\t%s", Nb_sampled_couple)

    num_tree = 1
    metadata_tree_dico = {}
    for tree_filename in lnf:
        start_tree_time = time.time()
        logger.info("START: %s", os.path.basename(tree_filename))
        OutDirNamePrefixTree = "%s/Tree_%s/" % (OutDirName, num_tree)
        repseq0 = OutDirNamePrefixTree + "/sequences"
        repest0 = OutDirNamePrefixTree + "/estimations"
        reptree0 = OutDirNamePrefixTree + "/nw_trees"
        repplottreeali0 = OutDirNamePrefixTree + "/plot_tree_ali"
        replikelihoodsummary0 = OutDirNamePrefixTree + "/likelihood_summaries"

        if not os.path.exists(OutDirNamePrefixTree):
            os.mkdir(OutDirNamePrefixTree)
        if not os.path.exists(repseq0):
            os.mkdir(repseq0)
        if not os.path.exists(repest0):
            os.mkdir(repest0)
        if not os.path.exists(reptree0):
            os.mkdir(reptree0)
        if not os.path.exists(repplottreeali0) and args.plot_ali:
            os.mkdir(repplottreeali0)
        if not os.path.exists(replikelihoodsummary0) and args.get_likelihood_summaries:
            os.mkdir(replikelihoodsummary0)

        # list for multiprocessing to unpack
        list_to_map = [(i + 1, tree_filename, OutDirNamePrefixTree, args) for i in range(Nbsimul)]
        if cpu == 1:
            # serial execution: easier to debug, no pool overhead
            r = []
            for x in list_to_map:
                r.append(mk_simu(x))
        else:
            p = multiprocessing.Pool(processes=cpus)
            pool_results = p.map_async(mk_simu, list_to_map)
            pool_results.wait()
            r = pool_results.get()

        metada_simu_global = []
        metada_simu_per_couple_het = []
        metada_simu_per_couple_topo = []
        metada_simu_per_couple_sub = []

        # mk_simu returns 0 on failure, otherwise a 5-tuple of metadata.
        bilan_nodesWithTransitions = []
        for z in r:
            if z != 0:
                x, y_het, y_topo, y_sub, nodesWithTransitions = z
                metada_simu_global.append(x)
                metada_simu_per_couple_het.extend(y_het)
                metada_simu_per_couple_topo.extend(y_topo)
                metada_simu_per_couple_sub.extend(y_sub)
                bilan_nodesWithTransitions.extend(nodesWithTransitions)

        ### Write metada on the tree
        metada_simu = pd.DataFrame(metada_simu_global)
        sorted_colums = ["ScenarioID"] + [c for c in metada_simu.columns if c != "ScenarioID"]
        # BUGFIX: DataFrame.reindex_axis was deprecated and removed from
        # pandas; reindex(columns=...) is the supported equivalent.
        metada_simu = metada_simu.reindex(columns=sorted_colums)
        metada_simu.to_csv(OutDirNamePrefixTree + "/MetadataScenarios.tsv", sep='\t', index=False)

        metada_simu_het = pd.DataFrame()
        metada_simu_topo = pd.DataFrame()
        metada_simu_sub = pd.DataFrame()
        if args.pcoc:
            metada_simu_het = pd.DataFrame(metada_simu_per_couple_het)
            # metada_simu.to_csv(OutDirNamePrefixTree + "/Scenarios_metadata_per_couple_mod_het.tsv", sep='\t', index=False)
        if args.topo:
            metada_simu_topo = pd.DataFrame(metada_simu_per_couple_topo)
            # metada_simu.to_csv(OutDirNamePrefixTree + "/Scenarios_metadata_per_couple_topo.tsv", sep='\t', index=False)
        if args.ident:
            metada_simu_sub = pd.DataFrame(metada_simu_per_couple_sub)
            # metada_simu.to_csv(OutDirNamePrefixTree + "/Scenarios_metadata_per_couple_obs_sub.tsv", sep='\t', index=False)

        df_concat = [df for df in [metada_simu_het, metada_simu_topo, metada_simu_sub] if not df.empty]
        if df_concat:
            df_cat = pd.concat(df_concat)
            # df_cat = df_cat["RunID","InputTree","ScenarioID","SimuCoupleID","C1","C2","DistanceSimuCouple","Method","Threshold","FN","FP","TN","TP","Sensitivity","Specificity","MCC","NumberOfConvergentEvents","NumberOfSites","PosteriorProbabilityType"]]
            df_cat = df_cat[
                ["RunID", "InputTree", "ScenarioID", "SimuCoupleID", "C1", "C2", "DistanceSimuCouple", "Method",
                 "Threshold", "FN", "FP", "TN", "TP", "Sensitivity", "Specificity", "MCC", "NumberOfConvergentEvents",
                 "NumberOfSites"]]
            df_cat.to_csv(OutDirNamePrefixTree + "/BenchmarkResults.tsv", sep='\t', index=False)

        if args.plot_event_repartition:
            plot_data.mk_bilan_tree(events_placing.init_tree(tree_filename), bilan_nodesWithTransitions,
                                    OutDirNamePrefixTree + "/Tree_" + str(num_tree) + ".pdf")

            # script_dirname = os.path.dirname(os.path.abspath(__file__))
            # R_command="Rscript %s/rscripts/mk_sens_spe_MCC_plot.R %s %s" %(script_dirname, os.environ['PWD'] + "/" + OutDirNamePrefixTree, os.environ['PWD'] + "/" +OutDirNamePrefixTree)
            # logger.info(R_command)
            # p = subprocess.Popen(R_command.split(" "))
            # (out_p, err_p) = p.communicate()
            # logger.debug("out: %s \n err: %s", out_p, err_p)

        metadata_tree_dico["Tree_%s" % (num_tree)] = [os.path.basename(tree_filename),
                                                      str(time.time() - start_tree_time)]

        logger.info("END: %s (%s seconds)", os.path.basename(tree_filename), str(time.time() - start_tree_time))
        logger.info("%s/%s scenario succeded", len(r), Nbsimul)
        num_tree += 1

    ### Write metada on the run
    ## Reorder metadata dict
    restructured_metadata_tree_dico = {}
    # list(...) so the column is a concrete sequence (dict.keys() is a
    # lazy view on Python 3).
    restructured_metadata_tree_dico["Tree_#"] = list(metadata_tree_dico.keys())
    restructured_metadata_tree_dico["Tree_filename"] = [metadata_tree_dico[k][0] for k in
                                                        restructured_metadata_tree_dico["Tree_#"]]
    restructured_metadata_tree_dico["Execution time"] = [metadata_tree_dico[k][1] for k in
                                                         restructured_metadata_tree_dico["Tree_#"]]
    df_tree = pd.DataFrame.from_dict(restructured_metadata_tree_dico)
    ## Reorder columns
    df_tree = df_tree[["Tree_#", "Tree_filename", "Execution time"]]

    df_tree.to_csv(OutDirName + "/tree_metadata.tsv", sep='\t', index=False)

    if not args.no_cleanup:
        remove_folder(repbppconfig)
        # NOTE(review): repest0/repseq0 are per-tree loop variables, so
        # only the *last* tree's directories are removed here -- confirm
        # this is the intended cleanup scope.
        remove_folder(repest0)
        if not args.no_clean_seqs:
            remove_folder(repseq0)

    # NOTE(review): start_time is not defined in this function -- it is
    # presumably a module-level global set at program start; confirm.
    end_time = str(time.time() - start_time)
    logger.info("--- %s seconds ---", end_time)
    logger.info("Output dir: %s", OutDirName)