Пример #1
0
def collectRptScout(DIR, tool):
    """Gather accuracy, resource and coverage numbers for every RPT_SCOUT run.

    For each organism in ``data_map`` and each ``f`` in ``f_list`` a stats
    hash is created with ``create_stats_hash`` and filled, where the inputs
    exist, from: the RM output (via ``perform_stats.perform_stats``), the job
    records of the tool run and of the RM run (via ``redhawk.loadPBS``), and
    the PRA analysis of the blast output.  The hash is then stored in the
    module-level ``stats_map`` under the key ``(tool, org, None, f)``.

    Args:
        DIR: Directory containing the per-run result directories; job records
            are looked up in ``DIR/../job_log``.
        tool: Key into ``tool_prefix`` giving the job-file name prefix.

    Returns:
        None.  Results are delivered through ``stats_map``.
    """
    import sys  # function-scope import: only needed for the diagnostics below

    def _warn(stage, target, err):
        # Missing or unfinished runs are tolerated, but the reason is reported
        # instead of being silently discarded.
        sys.stderr.write("WARNING: {0} failed for {1}: {2!r}\n".format(stage, target, err))

    def _record_resources(job_file, prefix, H):
        # Copy the resource usage of one stored job into H under
        # "<prefix>CpuTime", "<prefix>WallTime", "<prefix>Mem", "<prefix>VMem".
        if not os.path.exists(job_file):
            return
        with open(job_file, "rb") as fh:
            job = redhawk.loadPBS(fh)[0]
        try:
            if job.efile_exists():
                (H[prefix + "CpuTime"], H[prefix + "WallTime"],
                 H[prefix + "Mem"], H[prefix + "VMem"]) = job.getResources()
        except Exception as err:
            _warn("resource lookup", job_file, err)
        # The job object is written back after being queried, as before.
        with open(job_file, "wb") as fh:
            redhawk.storePBS([job], fh)

    def _run_pra(blast_path, pra_path):
        # Equivalent of "bzcat <blast> | ./pra_analysis2 <pra>" without a
        # shell, so paths containing spaces or metacharacters are safe.
        bzcat = subprocess.Popen(["bzcat", blast_path], stdout=subprocess.PIPE)
        try:
            subprocess.call(["./pra_analysis2", pra_path], stdin=bzcat.stdout)
        finally:
            bzcat.stdout.close()
            bzcat.wait()

    for org in data_map:
        for f in f_list:
            run_id = "{org}.s0.f{f}".format(org=org, f=f)
            RS_job_file = DIR + "/../job_log/{prefix}.{run}".format(prefix=tool_prefix[tool], run=run_id)
            RM_job_file = DIR + "/../job_log/rm.{prefix}.{run}".format(prefix=tool_prefix[tool], run=run_id)
            RS_dir = DIR + "/" + run_id.upper()
            tool_output = RS_dir + "/" + "{org}.fa.out".format(org=org)
            blast_file = RS_dir + "/" + run_id + ".RS.blast.6.txt.bz2"
            pra_output = RS_dir + "/" + run_id + ".pra.txt"

            real_repeats = data_map[org] + ".out"

            H = create_stats_hash(tool, org, None, int(f))

            # Get stats from RM run
            try:
                Counts, Stats, _ = perform_stats.perform_stats(real_repeats, tool_output, None)
                H["tp"], H["fp"], H["fn"], H["tn"] = Counts
                H["tpr"], H["tnr"], H["ppv"], H["npv"], H["fpr"], H["fdr"] = Stats
            except Exception as err:
                _warn("RM stats", tool_output, err)

            # Get resource usage from RPT_SCOUT run, then from RM run
            _record_resources(RS_job_file, "Tool", H)
            _record_resources(RM_job_file, "RM", H)

            if os.path.exists(blast_file):
                _run_pra(blast_file, pra_output)
                query_cover, target_cover, _ = parse_pra_output.parse_pra_output(pra_output, "exclude.txt")
                # NOTE(review): "ConCoverage" receives query_cover and
                # "QuCoverage" receives target_cover -- confirm this pairing
                # is intended.
                H["ConCoverage"], H["QuCoverage"] = query_cover, target_cover

            stats_map[(tool, org, None, f)] = H
Пример #2
0
def collectRaider(DIR, tool):
    """Gather accuracy, resource and coverage numbers for every RAIDER run.

    For each organism in ``data_map``, each seed number in ``seed_map`` and
    each ``f`` in ``f_list`` a stats hash is created with
    ``create_stats_hash``, annotated with the seed length (``"l"``), the
    number of ``"1"`` characters in the converted seed (``"w"``) and their
    ratio (``"w/l"``), and then filled, where the inputs exist, from the RM
    output, the stored job records and the PRA analysis of the blast output.
    The hash is stored in the module-level ``stats_map`` under the key
    ``(tool, org, seed_num, f)``.

    Args:
        DIR: Directory containing the per-run result directories; job records
            are looked up in ``DIR/../job_log``.
        tool: Key into ``tool_prefix`` giving the job-file name prefix.

    Returns:
        None.  Results are delivered through ``stats_map``.
    """
    import sys  # function-scope import: only needed for the diagnostics below

    def _warn(stage, target, err):
        # Missing or unfinished runs are tolerated, but the reason is reported
        # instead of being silently discarded.
        sys.stderr.write("WARNING: {0} failed for {1}: {2!r}\n".format(stage, target, err))

    def _record_resources(job_file, prefix, H):
        # Copy the resource usage of one stored job into H under
        # "<prefix>CpuTime", "<prefix>WallTime", "<prefix>Mem", "<prefix>VMem".
        if not os.path.exists(job_file):
            return
        with open(job_file, "rb") as fh:
            job = redhawk.loadPBS(fh)[0]
        try:
            if job.efile_exists():
                (H[prefix + "CpuTime"], H[prefix + "WallTime"],
                 H[prefix + "Mem"], H[prefix + "VMem"]) = job.getResources()
        except Exception as err:
            _warn("resource lookup", job_file, err)
        # The job object is written back after being queried, as before.
        with open(job_file, "wb") as fh:
            redhawk.storePBS([job], fh)

    def _run_pra(blast_path, pra_path):
        # Equivalent of "bzcat <blast> | ./pra_analysis2 <pra>" without a
        # shell, so paths containing spaces or metacharacters are safe.
        bzcat = subprocess.Popen(["bzcat", blast_path], stdout=subprocess.PIPE)
        try:
            subprocess.call(["./pra_analysis2", pra_path], stdin=bzcat.stdout)
        finally:
            bzcat.stdout.close()
            bzcat.wait()

    for org in data_map:
        for seed_num in seed_map:
            for f in f_list:
                print("File: " + org + " " + str(seed_num) + " " + str(f) + "\n")
                run_id = "{org}.s{seed}.f{f}".format(org=org, seed=seed_num, f=f)
                RAIDER_job_file = DIR + "/../job_log/{prefix}.{run}".format(
                    prefix=tool_prefix[tool], run=run_id
                )
                RM_job_file = DIR + "/../job_log/rm.{prefix}.{run}".format(
                    prefix=tool_prefix[tool], run=run_id
                )
                RM_dir = DIR + "/" + run_id.upper()
                tool_output = RM_dir + "/" + "{org}.fa.out".format(org=org)
                blast_file = RM_dir + "/" + run_id + ".blast.6.txt.bz2"
                pra_output = RM_dir + "/" + run_id + ".pra.txt"

                real_repeats = data_map[org] + ".out"

                H = create_stats_hash(tool, org, int(seed_num), int(f))

                seed = convert_seed(seed_map[seed_num])
                seed_len = len(seed)
                seed_weight = seed.count("1")

                H["l"] = seed_len
                H["w"] = seed_weight
                H["w/l"] = seed_weight / float(seed_len)

                # Get stats from RM run
                try:
                    Counts, Stats, _ = perform_stats.perform_stats(real_repeats, tool_output, None)
                    H["tp"], H["fp"], H["fn"], H["tn"] = Counts
                    H["tpr"], H["tnr"], H["ppv"], H["npv"], H["fpr"], H["fdr"] = Stats
                except Exception as err:
                    _warn("RM stats", tool_output, err)

                # Get resource usage from RAIDER run, then from RM run
                _record_resources(RAIDER_job_file, "Tool", H)
                _record_resources(RM_job_file, "RM", H)

                if os.path.exists(blast_file):
                    if not os.path.exists(pra_output):
                        cmd = "bzcat {blast_output} | ./pra_analysis2 {output}".format(
                            blast_output=blast_file, output=pra_output
                        )
                        print("cmd: " + cmd)
                        # The analysis must actually run here: the parse step
                        # below reads pra_output, which is known to be missing
                        # on this branch.
                        _run_pra(blast_file, pra_output)
                    query_cover, target_cover, _ = parse_pra_output.parse_pra_output(pra_output, "exclude.txt")
                    # NOTE(review): "ConCoverage" receives query_cover and
                    # "QuCoverage" receives target_cover -- confirm this
                    # pairing is intended.
                    H["ConCoverage"], H["QuCoverage"] = query_cover, target_cover

                stats_map[(tool, org, seed_num, f)] = H
Пример #3
0
def collectRaider(DIR, tool):
    """Gather accuracy, resource and coverage numbers for every RAIDER run.

    For each organism in ``data_map``, each seed number in ``seed_map`` and
    each ``f`` in ``f_list`` a stats hash is created with
    ``create_stats_hash``, annotated with the seed length (``'l'``), the
    number of ``"1"`` characters in the converted seed (``'w'``) and their
    ratio (``'w/l'``), and then filled, where the inputs exist, from the RM
    output (via ``rm_analysis.collect_stats``), the stored job records and
    the PRA analysis of the blast output.  The hash is stored in the
    module-level ``stats_map`` under the key ``(tool, org, seed_num, f)``.

    Args:
        DIR: Directory containing the per-run result directories; job records
            are looked up in ``DIR/../job_log``.
        tool: Key into ``tool_prefix`` giving the job-file name prefix.

    Returns:
        None.  Results are delivered through ``stats_map``.
    """
    import sys  # function-scope import: only needed for the diagnostics below

    def _warn(stage, target, err):
        # Missing or unfinished runs are tolerated, but the reason is reported
        # instead of being silently discarded.
        sys.stderr.write("WARNING: {0} failed for {1}: {2!r}\n".format(stage, target, err))

    def _record_resources(job_file, prefix, H):
        # Copy the resource usage of one stored job into H under
        # '<prefix>CpuTime', '<prefix>WallTime', '<prefix>Mem', '<prefix>VMem'.
        if not os.path.exists(job_file):
            return
        with open(job_file, "rb") as fh:
            job = redhawk.loadPBS(fh)[0]
        try:
            if job.efile_exists():
                (H[prefix + 'CpuTime'], H[prefix + 'WallTime'],
                 H[prefix + 'Mem'], H[prefix + 'VMem']) = job.getResources()
        except Exception as err:
            _warn("resource lookup", job_file, err)
        # The job object is written back after being queried, as before.
        with open(job_file, "wb") as fh:
            redhawk.storePBS([job], fh)

    def _run_pra(blast_path, pra_path):
        # Equivalent of "bzcat <blast> | ./pra_analysis2 <pra>" without a
        # shell, so paths containing spaces or metacharacters are safe.
        bzcat = subprocess.Popen(["bzcat", blast_path], stdout=subprocess.PIPE)
        try:
            subprocess.call(["./pra_analysis2", pra_path], stdin=bzcat.stdout)
        finally:
            bzcat.stdout.close()
            bzcat.wait()

    for org in data_map:
        for seed_num in seed_map:
            for f in f_list:
                print("File: " + org + " " + str(seed_num) + " " + str(f))
                run_id = "{org}.s{seed}.f{f}".format(org=org, seed=seed_num, f=f)
                RAIDER_job_file = DIR + "/../job_log/{prefix}.{run}".format(prefix=tool_prefix[tool], run=run_id)
                RM_job_file = DIR + "/../job_log/rm.{prefix}.{run}".format(prefix=tool_prefix[tool], run=run_id)
                RM_dir = DIR + "/" + run_id.upper()
                tool_output = RM_dir + "/" + "{org}.fa.out".format(org=org)
                blast_file = RM_dir + "/" + run_id + ".blast.6.txt.bz2"
                pra_output = RM_dir + "/" + run_id + ".pra.txt"

                real_repeats = data_map[org] + ".out"

                H = create_stats_hash(tool, org, int(seed_num), int(f))

                seed = convert_seed(seed_map[seed_num])
                seed_len = len(seed)
                seed_weight = seed.count("1")

                H['l'] = seed_len
                H['w'] = seed_weight
                H['w/l'] = seed_weight / float(seed_len)

                # Get stats from RM run
                try:
                    negatives, fp, _, positives, tp, _ = rm_analysis.collect_stats(
                        real_repeats, tool_output, fp_dist)
                    H['tp'] = tp
                    # TODO: the original author flagged this fp value as
                    # needing to be double-checked.
                    H['fp'] = fp
                    H['tn'] = negatives - H['fp']
                    H['fn'] = positives - H['tp']
                    H['tpr'] = H['tp'] / positives
                    H['tnr'] = H['tn'] / negatives
                    H['ppv'] = H['tp'] / (H['tp'] + H['fp'])
                    H['npv'] = H['tn'] / (H['tn'] + H['fn'])
                    H['fpr'] = H['fp'] / negatives
                    H['fnr'] = 1 - H['tpr']
                    # 1 - ppv belongs under 'fdr'; it used to be stored only
                    # under the misspelled key 'dfr', which is kept as an
                    # alias for any existing reader.
                    H['fdr'] = 1 - H['ppv']
                    H['dfr'] = H['fdr']
                except Exception as err:
                    # Includes ZeroDivisionError when a denominator is zero;
                    # the keys assigned before the failure stay set.
                    _warn("RM stats", tool_output, err)

                # Get resource usage from RAIDER run, then from RM run
                _record_resources(RAIDER_job_file, 'Tool', H)
                _record_resources(RM_job_file, 'RM', H)

                if os.path.exists(blast_file):
                    # Only run the analysis when its output is not there yet.
                    if not os.path.exists(pra_output):
                        _run_pra(blast_file, pra_output)
                    query_cover, target_cover, _ = parse_pra_output.parse_pra_output(pra_output, "exclude.txt")
                    # NOTE(review): 'ConCoverage' receives query_cover and
                    # 'QuCoverage' receives target_cover -- confirm this
                    # pairing is intended.
                    H['ConCoverage'], H['QuCoverage'] = query_cover, target_cover

                stats_map[(tool, org, seed_num, f)] = H
Пример #4
0
def collectRptScout(DIR, tool):
    """Gather accuracy, resource and coverage numbers for every RPT_SCOUT run.

    For each organism in ``data_map`` and each ``f`` in ``f_list`` a stats
    hash is created with ``create_stats_hash`` and filled, where the inputs
    exist, from: the RM output (via ``rm_analysis.collect_stats``), the job
    records of the tool run and of the RM run (via ``redhawk.loadPBS``), and
    the PRA analysis of the blast output.  The hash is then stored in the
    module-level ``stats_map`` under the key ``(tool, org, None, f)``.

    Args:
        DIR: Directory containing the per-run result directories; job records
            are looked up in ``DIR/../job_log``.
        tool: Key into ``tool_prefix`` giving the job-file name prefix.

    Returns:
        None.  Results are delivered through ``stats_map``.
    """
    import sys  # function-scope import: only needed for the diagnostics below

    def _warn(stage, target, err):
        # Missing or unfinished runs are tolerated, but the reason is reported
        # instead of being silently discarded.
        sys.stderr.write("WARNING: {0} failed for {1}: {2!r}\n".format(stage, target, err))

    def _record_resources(job_file, prefix, H):
        # Copy the resource usage of one stored job into H under
        # '<prefix>CpuTime', '<prefix>WallTime', '<prefix>Mem', '<prefix>VMem'.
        if not os.path.exists(job_file):
            return
        with open(job_file, "rb") as fh:
            job = redhawk.loadPBS(fh)[0]
        try:
            if job.efile_exists():
                (H[prefix + 'CpuTime'], H[prefix + 'WallTime'],
                 H[prefix + 'Mem'], H[prefix + 'VMem']) = job.getResources()
        except Exception as err:
            _warn("resource lookup", job_file, err)
        # The job object is written back after being queried, as before.
        with open(job_file, "wb") as fh:
            redhawk.storePBS([job], fh)

    def _run_pra(blast_path, pra_path):
        # Equivalent of "bzcat <blast> | ./pra_analysis2 <pra>" without a
        # shell, so paths containing spaces or metacharacters are safe.
        bzcat = subprocess.Popen(["bzcat", blast_path], stdout=subprocess.PIPE)
        try:
            subprocess.call(["./pra_analysis2", pra_path], stdin=bzcat.stdout)
        finally:
            bzcat.stdout.close()
            bzcat.wait()

    for org in data_map:
        for f in f_list:
            run_id = "{org}.s0.f{f}".format(org=org, f=f)
            RS_job_file = DIR + "/../job_log/{prefix}.{run}".format(prefix=tool_prefix[tool], run=run_id)
            RM_job_file = DIR + "/../job_log/rm.{prefix}.{run}".format(prefix=tool_prefix[tool], run=run_id)
            RS_dir = DIR + "/" + run_id.upper()
            tool_output = RS_dir + "/" + "{org}.fa.out".format(org=org)
            blast_file = RS_dir + "/" + run_id + ".RS.blast.6.txt.bz2"
            pra_output = RS_dir + "/" + run_id + ".pra.txt"

            real_repeats = data_map[org] + ".out"

            H = create_stats_hash(tool, org, None, int(f))

            # Get stats from RM run
            try:
                negatives, fp, _, positives, tp, _ = rm_analysis.collect_stats(
                    real_repeats, tool_output, fp_dist)
                H['tp'] = tp
                # TODO: the original author flagged this fp value as needing
                # to be double-checked.
                H['fp'] = fp
                H['tn'] = negatives - H['fp']
                H['fn'] = positives - H['tp']
                H['tpr'] = H['tp'] / positives
                H['tnr'] = H['tn'] / negatives
                H['ppv'] = H['tp'] / (H['tp'] + H['fp'])
                H['npv'] = H['tn'] / (H['tn'] + H['fn'])
                H['fpr'] = H['fp'] / negatives
                H['fnr'] = 1 - H['tpr']
                # 1 - ppv belongs under 'fdr'; it used to be stored only under
                # the misspelled key 'dfr', which is kept as an alias for any
                # existing reader.
                H['fdr'] = 1 - H['ppv']
                H['dfr'] = H['fdr']
            except Exception as err:
                # Includes ZeroDivisionError when a denominator is zero; the
                # keys assigned before the failure stay set.
                _warn("RM stats", tool_output, err)

            # Get resource usage from RPT_SCOUT run, then from RM run
            _record_resources(RS_job_file, 'Tool', H)
            _record_resources(RM_job_file, 'RM', H)

            if os.path.exists(blast_file):
                _run_pra(blast_file, pra_output)
                query_cover, target_cover, _ = parse_pra_output.parse_pra_output(pra_output, "exclude.txt")
                # NOTE(review): 'ConCoverage' receives query_cover and
                # 'QuCoverage' receives target_cover -- confirm this pairing
                # is intended.
                H['ConCoverage'], H['QuCoverage'] = query_cover, target_cover

            stats_map[(tool, org, None, f)] = H
Пример #5
0
def collectRptScout(DIR, tool):
    """Gather accuracy, resource and coverage numbers for every RPT_SCOUT run.

    For each organism in ``data_map`` and each ``f`` in ``f_list`` a stats
    hash is created with ``create_stats_hash`` and filled, where the inputs
    exist, from: the RM output (via ``rm_analysis.collect_stats``), the job
    records of the tool run and of the RM run (via ``redhawk.loadPBS``), and
    the PRA analysis of the blast output.  The hash is then stored in the
    module-level ``stats_map`` under the key ``(tool, org, None, f)``.

    Args:
        DIR: Directory containing the per-run result directories; job records
            are looked up in ``DIR/../job_log``.
        tool: Key into ``tool_prefix`` giving the job-file name prefix.

    Returns:
        None.  Results are delivered through ``stats_map``.
    """
    import sys  # function-scope import: only needed for the diagnostics below

    def _warn(stage, target, err):
        # Missing or unfinished runs are tolerated, but the reason is reported
        # instead of being silently discarded.
        sys.stderr.write(
            "WARNING: {0} failed for {1}: {2!r}\n".format(stage, target, err))

    def _record_resources(job_file, prefix, H):
        # Copy the resource usage of one stored job into H under
        # '<prefix>CpuTime', '<prefix>WallTime', '<prefix>Mem', '<prefix>VMem'.
        if not os.path.exists(job_file):
            return
        with open(job_file, "rb") as fh:
            job = redhawk.loadPBS(fh)[0]
        try:
            if job.efile_exists():
                (H[prefix + 'CpuTime'], H[prefix + 'WallTime'],
                 H[prefix + 'Mem'], H[prefix + 'VMem']) = job.getResources()
        except Exception as err:
            _warn("resource lookup", job_file, err)
        # The job object is written back after being queried, as before.
        with open(job_file, "wb") as fh:
            redhawk.storePBS([job], fh)

    def _run_pra(blast_path, pra_path):
        # Equivalent of "bzcat <blast> | ./pra_analysis2 <pra>" without a
        # shell, so paths containing spaces or metacharacters are safe.
        bzcat = subprocess.Popen(["bzcat", blast_path],
                                 stdout=subprocess.PIPE)
        try:
            subprocess.call(["./pra_analysis2", pra_path], stdin=bzcat.stdout)
        finally:
            bzcat.stdout.close()
            bzcat.wait()

    for org in data_map:
        for f in f_list:
            run_id = "{org}.s0.f{f}".format(org=org, f=f)
            RS_job_file = DIR + "/../job_log/{prefix}.{run}".format(
                prefix=tool_prefix[tool], run=run_id)
            RM_job_file = DIR + "/../job_log/rm.{prefix}.{run}".format(
                prefix=tool_prefix[tool], run=run_id)
            RS_dir = DIR + "/" + run_id.upper()
            tool_output = RS_dir + "/" + "{org}.fa.out".format(org=org)
            blast_file = RS_dir + "/" + run_id + ".RS.blast.6.txt.bz2"
            pra_output = RS_dir + "/" + run_id + ".pra.txt"

            real_repeats = data_map[org] + ".out"

            H = create_stats_hash(tool, org, None, int(f))

            # Get stats from RM run
            try:
                negatives, fp, _, positives, tp, _ = rm_analysis.collect_stats(
                    real_repeats, tool_output, fp_dist)
                H['tp'] = tp
                # TODO: the original author flagged this fp value as needing
                # to be double-checked.
                H['fp'] = fp
                H['tn'] = negatives - H['fp']
                H['fn'] = positives - H['tp']
                H['tpr'] = H['tp'] / positives
                H['tnr'] = H['tn'] / negatives
                H['ppv'] = H['tp'] / (H['tp'] + H['fp'])
                H['npv'] = H['tn'] / (H['tn'] + H['fn'])
                H['fpr'] = H['fp'] / negatives
                H['fnr'] = 1 - H['tpr']
                # 1 - ppv belongs under 'fdr'; it used to be stored only under
                # the misspelled key 'dfr', which is kept as an alias for any
                # existing reader.
                H['fdr'] = 1 - H['ppv']
                H['dfr'] = H['fdr']
            except Exception as err:
                # Includes ZeroDivisionError when a denominator is zero; the
                # keys assigned before the failure stay set.
                _warn("RM stats", tool_output, err)

            # Get resource usage from RPT_SCOUT run, then from RM run
            _record_resources(RS_job_file, 'Tool', H)
            _record_resources(RM_job_file, 'RM', H)

            if os.path.exists(blast_file):
                _run_pra(blast_file, pra_output)
                query_cover, target_cover, _ = parse_pra_output.parse_pra_output(
                    pra_output, "exclude.txt")
                # NOTE(review): 'ConCoverage' receives query_cover and
                # 'QuCoverage' receives target_cover -- confirm this pairing
                # is intended.
                H['ConCoverage'], H['QuCoverage'] = query_cover, target_cover

            stats_map[(tool, org, None, f)] = H
Пример #6
0
def collectNaive(DIR, tool):  # NAIVE
    """Gather accuracy, resource and coverage numbers for every NAIVE run.

    For each organism in ``data_map``, each seed number in ``seed_map`` and
    each ``f`` in ``f_list`` a stats hash is created with
    ``create_stats_hash``, annotated with the seed length (``'l'``), the
    number of ``"1"`` characters in the converted seed (``'w'``) and their
    ratio (``'w/l'``), and then filled, where the inputs exist, from the RM
    output (via ``rm_analysis.collect_stats``), the stored job records and
    the PRA analysis of the blast output.  The hash is stored in the
    module-level ``stats_map`` under the key ``(tool, org, seed_num, f)``.

    Args:
        DIR: Directory containing the per-run result directories; job records
            are looked up in ``DIR/../job_log``.
        tool: Must be ``'naive'``; also used as the key into ``tool_prefix``.

    Returns:
        None.  Results are delivered through ``stats_map``.

    Raises:
        AssertionError: If ``tool`` is not ``'naive'`` (when assertions are
            enabled).
    """
    import sys  # function-scope import: only needed for the diagnostics below

    # Kept as an assert so callers see the same exception type as before.
    assert tool == 'naive'

    def _warn(stage, target, err):
        # Missing or unfinished runs are tolerated, but the reason is reported
        # instead of being silently discarded.
        sys.stderr.write(
            "WARNING: {0} failed for {1}: {2!r}\n".format(stage, target, err))

    def _record_resources(job_file, prefix, H):
        # Copy the resource usage of one stored job into H under
        # '<prefix>CpuTime', '<prefix>WallTime', '<prefix>Mem', '<prefix>VMem'.
        if not os.path.exists(job_file):
            return
        with open(job_file, "rb") as fh:
            job = redhawk.loadPBS(fh)[0]
        try:
            if job.efile_exists():
                (H[prefix + 'CpuTime'], H[prefix + 'WallTime'],
                 H[prefix + 'Mem'], H[prefix + 'VMem']) = job.getResources()
        except Exception as err:
            _warn("resource lookup", job_file, err)
        # The job object is written back after being queried, as before.
        with open(job_file, "wb") as fh:
            redhawk.storePBS([job], fh)

    def _run_pra(blast_path, pra_path):
        # Equivalent of "bzcat <blast> | ./pra_analysis2 <pra>" without a
        # shell, so paths containing spaces or metacharacters are safe.
        bzcat = subprocess.Popen(["bzcat", blast_path],
                                 stdout=subprocess.PIPE)
        try:
            subprocess.call(["./pra_analysis2", pra_path], stdin=bzcat.stdout)
        finally:
            bzcat.stdout.close()
            bzcat.wait()

    for org in data_map:
        for seed_num in seed_map:
            for f in f_list:
                print("File: " + org + " " + str(seed_num) + " " + str(f))
                run_id = "{org}.s{seed}.f{f}".format(
                    org=org, seed=seed_num, f=f)
                NAIVE_job_file = DIR + "/../job_log/{prefix}.{run}".format(
                    prefix=tool_prefix[tool], run=run_id)
                RM_job_file = DIR + "/../job_log/rm.{prefix}.{run}".format(
                    prefix=tool_prefix[tool], run=run_id)
                RM_dir = DIR + "/" + run_id.upper()
                tool_output = RM_dir + "/" + "{org}.fa.out".format(org=org)
                blast_file = RM_dir + "/" + run_id + ".blast.6.txt.bz2"
                pra_output = RM_dir + "/" + run_id + ".pra.txt"

                real_repeats = data_map[org] + ".out"

                H = create_stats_hash(tool, org, int(seed_num), int(f))

                seed = convert_seed(seed_map[seed_num])
                seed_len = len(seed)
                seed_weight = seed.count("1")

                H['l'] = seed_len
                H['w'] = seed_weight
                H['w/l'] = seed_weight / float(seed_len)

                # Get stats from RM run
                try:
                    negatives, fp, _, positives, tp, _ = rm_analysis.collect_stats(
                        real_repeats, tool_output, fp_dist)
                    H['tp'] = tp
                    # TODO: the original author flagged this fp value as
                    # needing to be double-checked.
                    H['fp'] = fp
                    H['tn'] = negatives - H['fp']
                    H['fn'] = positives - H['tp']
                    H['tpr'] = H['tp'] / positives
                    H['tnr'] = H['tn'] / negatives
                    H['ppv'] = H['tp'] / (H['tp'] + H['fp'])
                    H['npv'] = H['tn'] / (H['tn'] + H['fn'])
                    H['fpr'] = H['fp'] / negatives
                    H['fnr'] = 1 - H['tpr']
                    # 1 - ppv belongs under 'fdr'; it used to be stored only
                    # under the misspelled key 'dfr', which is kept as an
                    # alias for any existing reader.
                    H['fdr'] = 1 - H['ppv']
                    H['dfr'] = H['fdr']
                except Exception as err:
                    # Includes ZeroDivisionError when a denominator is zero;
                    # the keys assigned before the failure stay set.
                    _warn("RM stats", tool_output, err)

                # Get resource usage from NAIVE run, then from RM run
                _record_resources(NAIVE_job_file, 'Tool', H)
                _record_resources(RM_job_file, 'RM', H)

                if os.path.exists(blast_file):
                    # Only run the analysis when its output is not there yet.
                    if not os.path.exists(pra_output):
                        _run_pra(blast_file, pra_output)
                    query_cover, target_cover, _ = parse_pra_output.parse_pra_output(
                        pra_output, "exclude.txt")
                    # NOTE(review): 'ConCoverage' receives query_cover and
                    # 'QuCoverage' receives target_cover -- confirm this
                    # pairing is intended.
                    H['ConCoverage'], H['QuCoverage'] = query_cover, target_cover

                stats_map[(tool, org, seed_num, f)] = H