Exemplo n.º 1
0
def merge_alignment_stats(dirs, BASECALLER_RESULTS, ALIGNMENT_RESULTS, flows):
    """Merge per-block alignment statistics into composite results.

    Steps, in order:
      1. For every dataset in ``datasets_basecaller.json``, merge the
         per-block ``<file_prefix>.alignment.summary`` files found under
         ``dirs`` with ``mergeAlignStatsResults``.
      2. Symlink each composite ``<file_prefix>.alignment.summary`` to
         ``alignment_<barcode>.summary`` (``nomatch`` when the first read
         group has no ``barcode_name``).
      3. Sum the per-block ``alignStats_err.json`` files and write the
         composite ``alignStats_err.json`` into ``ALIGNMENT_RESULTS``.
      4. Merge the composite per-dataset results, regenerate the base
         error and alignment rate plots, and call ``aggregate_alignment``
         when a ``barcodeList.txt`` is found in the current directory or
         up to four levels above it.

    Args:
        dirs: Block directories; inputs are read from
            ``<dir>/<ALIGNMENT_RESULTS>/``.
        BASECALLER_RESULTS: Directory containing
            ``datasets_basecaller.json`` and ``readLen.txt``.
        ALIGNMENT_RESULTS: Alignment results directory; joined onto each
            block directory for inputs and used as-is for composite output.
        flows: Number of flows, used to derive the x-axis limit of the
            plots. Falls back to 400 when it cannot be converted to int.

    Returns:
        None. Returns early (after logging) when
        ``datasets_basecaller.json`` cannot be read or parsed.
    """
    datasets_path = os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json")
    try:
        # The context manager closes the file even if json.load() raises.
        with open(datasets_path, 'r') as f:
            datasets_basecaller = json.load(f)
    except Exception:
        printtime("ERROR: problem parsing %s" % datasets_path)
        traceback.print_exc()
        return

    # Merge the per-block alignment.summary files of every dataset.
    for dataset in datasets_basecaller['datasets']:
        try:
            block_prefixes = [
                os.path.join(block_dir, ALIGNMENT_RESULTS, dataset['file_prefix'] + '.')
                for block_dir in dirs]
            block_prefixes = [
                prefix for prefix in block_prefixes
                if os.path.exists(prefix + 'alignment.summary')]
            composite_prefix = os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.')
            if block_prefixes:
                mergeAlignStatsResults(block_prefixes, composite_prefix)
            else:
                printtime("Nothing to merge: " + dataset['file_prefix'])
        except Exception:
            printtime("ERROR: merging %s stats unsuccessful" % (dataset['file_prefix'] + '.bam'))
            # Keep going with the next dataset, but leave a trace of the cause.
            traceback.print_exc()

    try:
        graph_max_x = int(50 * math.ceil(0.014 * int(flows)))
    except (TypeError, ValueError, OverflowError):
        graph_max_x = 400

    # The datasets file parsed above is reused here; it is not re-read.
    input_prefix_list = []
    read_groups = datasets_basecaller['read_groups']
    for dataset in datasets_basecaller["datasets"]:
        printtime("Barcode processing, rename")
        src = os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.alignment.summary')
        if not os.path.exists(src):
            continue
        input_prefix_list.append(os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.'))
        # Hack to make aggregate_alignment happy: it is given
        # alignment_<barcode name>.summary links to the per-dataset files.
        read_group = read_groups[dataset['read_groups'][0]]
        X_name = read_group['barcode_name'] if 'barcode_name' in read_group else 'nomatch'
        dst = os.path.join(ALIGNMENT_RESULTS, 'alignment_%s.summary' % X_name)
        try:
            os.symlink(os.path.relpath(src, os.path.dirname(dst)), dst)
        except Exception:
            printtime("ERROR: Unable to symlink '%s' to '%s'" % (src, dst))

    # Merge alignStats_err.json: element-wise sums for the per-position
    # lists, plain sums for the scalar totals.
    per_base_keys = (
        'nread', 'unaligned', 'filtered', 'clipped', 'aligned',
        'n_err_at_position', 'cum_aligned', 'cum_err_at_position')
    total_keys = (
        'accuracy_total_bases', 'accuracy_total_errors',
        'total_mapped_target_bases', 'total_mapped_reads')

    merged_align_stats = {}
    align_stats_num_bases = 400
    for block_dir in dirs:
        block_stats_path = os.path.join(block_dir, ALIGNMENT_RESULTS, 'alignStats_err.json')
        try:
            with open(block_stats_path, 'r') as f:
                current_align_stats = json.load(f)
        except Exception:
            printtime("Merge alignStats_err.json: skipping %s" % block_stats_path)
            continue

        if not merged_align_stats:
            # The first readable block seeds the result and fixes the number
            # of positions that are summed for all later blocks.
            merged_align_stats = current_align_stats
            align_stats_num_bases = len(merged_align_stats.get("read_length", []))
            continue

        # Compute all sums before touching the accumulator, so a malformed
        # block (missing key, shorter list) is skipped as a whole instead of
        # aborting the merge or leaving a half-added state behind.
        try:
            summed = {}
            for key in per_base_keys:
                summed[key] = [
                    merged_align_stats[key][idx] + current_align_stats[key][idx]
                    for idx in range(align_stats_num_bases)]
            for key in total_keys:
                summed[key] = merged_align_stats[key] + current_align_stats[key]
        except (KeyError, IndexError, TypeError):
            printtime("Merge alignStats_err.json: skipping %s" % block_stats_path)
            traceback.print_exc()
            continue

        for key in per_base_keys:
            # Only the first align_stats_num_bases entries are replaced.
            merged_align_stats[key][:align_stats_num_bases] = summed[key]
        for key in total_keys:
            merged_align_stats[key] = summed[key]

    merged_stats_path = os.path.join(ALIGNMENT_RESULTS, 'alignStats_err.json')
    try:
        with open(merged_stats_path, "w") as f:
            json.dump(merged_align_stats, f, indent=4)
    except Exception:
        printtime("ERROR; Failed to write merged alignStats_err.json")
        traceback.print_exc()

    mergeAlignStatsResults(input_prefix_list, ALIGNMENT_RESULTS + "/")

    try:
        base_error_plot.generate_base_error_plot(
            merged_stats_path,
            os.path.join(ALIGNMENT_RESULTS, 'base_error_plot.png'), int(graph_max_x))
        base_error_plot.generate_alignment_rate_plot(
            merged_stats_path,
            os.path.join(BASECALLER_RESULTS, 'readLen.txt'),
            os.path.join(ALIGNMENT_RESULTS, 'alignment_rate_plot.png'), int(graph_max_x))
        printtime("Base error plot has been created successfully")
    except Exception:
        printtime("ERROR: Failed to generate base error plot")
        traceback.print_exc()

    # Generate alignment_barcode_summary.csv using the nearest
    # barcodeList.txt, searching from the current directory upwards.
    for levels_up in range(5):
        barcodelist_path = '../' * levels_up + 'barcodeList.txt'
        if os.path.exists(barcodelist_path):
            printtime("Barcode processing, aggregate")
            aggregate_alignment("./", barcodelist_path)
            break
Exemplo n.º 2
0
                 rl_in.close()
             else:
                 print("ERROR: skipped %s" % readlen_file)
         rl_out.close()
     except:
         traceback.print_exc()
         
     # Make plots for merged Report
     ALIGNMENT_RESULTS = '.'
     graph_max_x = round(max(l),-2) if round(max(l),-2) < 400 else 400
     try:
         base_error_plot.generate_base_error_plot(
             os.path.join(ALIGNMENT_RESULTS,'alignStats_err.json'),
             os.path.join(ALIGNMENT_RESULTS,'base_error_plot.png'),int(graph_max_x))
         base_error_plot.generate_alignment_rate_plot(
             os.path.join(ALIGNMENT_RESULTS,'alignStats_err.json'),
             os.path.join('.','readLen.txt'),
             os.path.join(ALIGNMENT_RESULTS,'alignment_rate_plot.png'),int(graph_max_x))            
         print("Base error plot has been created successfully")
     except:
         print("ERROR: Failed to generate base error plot")
         traceback.print_exc()        
     
      
 if args.zip and len(args.files) > 1: 
    # zip barcoded files
    zipname = args.zip
    print "Zip merged barcode files to %s" % zipname
    for filename in args.files:                      
      if os.path.exists(filename):
         try:
             make_zip(zipname, filename, arcname=filename)
Exemplo n.º 3
0
def alignment_post_processing(
        BASECALLER_RESULTS,
        ALIGNMENT_RESULTS,
        flows,
        mark_duplicates,
        force_alignstats):
    """Post-process alignment output: links, composite BAM, stats and plots.

    Steps, in order:
      1. For every dataset in ``datasets_basecaller.json`` whose
         ``basecaller_bam`` exists, symlink its ``.alignment.summary`` to
         ``alignment_<barcode>.summary`` (``nomatch`` when the first read
         group has no ``barcode_name``) and create legacy-name links for
         the ``.bam`` / ``.bam.bai`` files when ``legacy_prefix`` is set.
      2. If ``rawlib.bam`` is missing from ``ALIGNMENT_RESULTS``, merge the
         per-dataset BAM files into it with
         ``blockprocessing.merge_bam_files``.
      3. Run the external ``alignStats`` tool on the composite BAM when
         ``force_alignstats`` is true or the composite BAM was just built.
      4. Merge the per-dataset alignment statistics, regenerate the base
         error and alignment rate plots, call ``aggregate_alignment`` when
         a ``barcodeList.txt`` is found in the current directory or up to
         four levels above it, and finally call ``makeAlignGraphs``.

    Args:
        BASECALLER_RESULTS: Directory containing
            ``datasets_basecaller.json``, the basecaller BAM files and
            ``readLen.txt``.
        ALIGNMENT_RESULTS: Directory holding the alignment output.
        flows: Number of flows, used to derive the x-axis limit of the
            plots. Falls back to 400 when it cannot be converted to int.
        mark_duplicates: Passed through to
            ``blockprocessing.merge_bam_files``.
        force_alignstats: Run ``alignStats`` even when the composite BAM
            already exists.

    Returns:
        None. Returns early (after logging) when
        ``datasets_basecaller.json`` cannot be read or parsed.
    """
    # Function-scope import: this block cannot see the file's import section.
    import subprocess

    datasets_path = os.path.join(BASECALLER_RESULTS, "datasets_basecaller.json")
    try:
        # The context manager closes the file even if json.load() raises.
        with open(datasets_path, 'r') as f:
            datasets_basecaller = json.load(f)
    except Exception:
        printtime("ERROR: problem parsing %s" % datasets_path)
        traceback.print_exc()
        return

    try:
        graph_max_x = int(50 * math.ceil(0.014 * int(flows)))
    except (TypeError, ValueError, OverflowError):
        graph_max_x = 400

    input_prefix_list = []

    for dataset in datasets_basecaller["datasets"]:
        if not os.path.exists(os.path.join(BASECALLER_RESULTS, dataset['basecaller_bam'])):
            continue

        printtime("Barcode processing, rename")
        src = os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.alignment.summary')
        if os.path.exists(src):
            input_prefix_list.append(os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.'))
            # Hack to make aggregate_alignment happy: it is given
            # alignment_<barcode name>.summary links to the per-dataset files.
            read_group = datasets_basecaller['read_groups'][dataset['read_groups'][0]]
            X_name = read_group['barcode_name'] if 'barcode_name' in read_group else 'nomatch'
            dst = os.path.join(ALIGNMENT_RESULTS, 'alignment_%s.summary' % X_name)
            try:
                os.symlink(os.path.relpath(src, os.path.dirname(dst)), dst)
            except Exception:
                printtime("ERROR: Unable to symlink '%s' to '%s'" % (src, dst))

        printtime("Creating legacy name links")
        # "in" instead of dict.has_key(), which was removed in Python 3.
        if 'legacy_prefix' in dataset:
            legacy_name = os.path.basename(dataset['legacy_prefix'])
            for suffix in ('.bam', '.bam.bai'):
                link_src = os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + suffix)
                link_dst = os.path.join(ALIGNMENT_RESULTS, legacy_name + suffix)
                try:
                    os.symlink(os.path.relpath(link_src, os.path.dirname(link_dst)), link_dst)
                except Exception:
                    printtime("ERROR: Unable to symlink '%s' to '%s'" % (link_src, link_dst))

    # Special legacy post-processing.
    # Generate the composite rawlib.bam from the per-dataset BAM files.

    composite_bam_filename = os.path.join(ALIGNMENT_RESULTS, 'rawlib.bam')
    if not os.path.exists(composite_bam_filename):
        candidate_bams = [
            os.path.join(ALIGNMENT_RESULTS, os.path.basename(dataset['file_prefix']) + '.bam')
            for dataset in datasets_basecaller["datasets"]]
        bam_file_list = [bam_name for bam_name in candidate_bams if os.path.exists(bam_name)]

        blockprocessing.merge_bam_files(
            bam_file_list, composite_bam_filename, composite_bam_filename + '.bai', mark_duplicates)
        # A freshly built composite BAM has no statistics yet.
        force_alignstats = True

    if force_alignstats:
        # Generate data for the error plot from the composite bam.
        printtime("Call alignStats to generate raw accuracy")
        # Argument list instead of a shell string, so paths containing
        # spaces or shell metacharacters reach alignStats unchanged.
        cmd = [
            "alignStats",
            "-n", "12",
            "--alignSummaryFile", "alignStats_err.txt",
            "--alignSummaryJsonFile", "alignStats_err.json",
            "--alignSummaryMinLen", "1",
            # Fixed at 400 rather than graph_max_x.
            "--alignSummaryMaxLen", "400",
            "--alignSummaryLenStep", "1",
            "--alignSummaryMaxErr", "10",
            "--infile", composite_bam_filename,
            "--outputDir", ALIGNMENT_RESULTS,
        ]
        printtime("DEBUG: Calling '%s'" % " ".join(cmd))
        try:
            # A failing command does not raise; its exit status must be checked.
            exit_status = subprocess.call(cmd)
            if exit_status != 0:
                printtime("alignStats failed with exit status %s" % exit_status)
        except OSError:
            # Raised when the executable cannot be started at all.
            printtime("alignStats failed")
            traceback.print_exc()

    mergeAlignStatsResults(input_prefix_list, ALIGNMENT_RESULTS + "/")

    try:
        align_stats_json = os.path.join(ALIGNMENT_RESULTS, 'alignStats_err.json')
        base_error_plot.generate_base_error_plot(
            align_stats_json,
            os.path.join(ALIGNMENT_RESULTS, 'base_error_plot.png'), int(graph_max_x))
        base_error_plot.generate_alignment_rate_plot(
            align_stats_json,
            os.path.join(BASECALLER_RESULTS, 'readLen.txt'),
            os.path.join(ALIGNMENT_RESULTS, 'alignment_rate_plot.png'), int(graph_max_x))
        printtime("Base error plot has been created successfully")
    except Exception:
        printtime("ERROR: Failed to generate base error plot")
        traceback.print_exc()

    # Generate alignment_barcode_summary.csv using the nearest
    # barcodeList.txt, searching from the current directory upwards.
    for levels_up in range(5):
        barcodelist_path = '../' * levels_up + 'barcodeList.txt'
        if os.path.exists(barcodelist_path):
            printtime("Barcode processing, aggregate")
            aggregate_alignment("./", barcodelist_path)
            break

    # These graphs are likely obsolete
    makeAlignGraphs()