Пример #1
0
def process_datasets(
    blocks,
    alignmentArgs,
    ionstatsArgs,
    BASECALLER_RESULTS,
    basecaller_meta_information,
    library_key,
    graph_max_x,
    basecaller_datasets,
    ALIGNMENT_RESULTS,
    do_realign,
    do_ionstats,
    do_mark_duplicates,
    do_indexing,
    barcodeInfo,
):
    """Align all non-empty datasets in a worker pool, then merge ionstats.

    One argument tuple is built per dataset whose ``read_count`` is non-zero
    and handed to ``align_dataset_parallel_wrap`` through a
    ``multiprocessing.Pool``.  When ``do_ionstats`` is true the per-dataset
    ionstats JSON files of the non-filtered datasets are reduced into the
    summary files under ``ALIGNMENT_RESULTS`` and ``BASECALLER_RESULTS``.

    Args:
        blocks: Logged here and forwarded unchanged to each worker.
        alignmentArgs: Forwarded unchanged to each worker.
        ionstatsArgs: Forwarded to each worker and to
            ``ionstats.generate_ionstats_alignment`` in the fallback path.
        BASECALLER_RESULTS: Directory used for basecaller ionstats files.
        basecaller_meta_information: Forwarded to the workers / ionstats.
        library_key: Forwarded to the workers / ionstats.
        graph_max_x: Forwarded to the workers / ionstats.
        basecaller_datasets: Mapping with a ``"datasets"`` list and a
            ``"read_groups"`` mapping.  Each dataset provides
            ``"read_groups"``, ``"read_count"`` and ``"file_prefix"``.
        ALIGNMENT_RESULTS: Directory used for alignment ionstats files.
        do_realign: Forwarded unchanged to each worker.
        do_ionstats: Forwarded to each worker; also enables the merge step.
        do_mark_duplicates: Forwarded unchanged to each worker.
        do_indexing: Forwarded unchanged to each worker.
        barcodeInfo: Forwarded unchanged to each worker.

    Raises:
        RuntimeError: If the shell command creating ``empty_dummy.bam``
            exits with a non-zero status.
    """

    def _write_empty_dummy_bam():
        # Creates a header-only BAM in the current directory so ionstats can
        # still produce an initial JSON file when there is nothing to merge.
        # TODO: ionstats needs to produce initial json file
        cmd = "echo  '@HD\tVN:1.5\tSO:coordinate\n@SQ\tSN:ref\tLN:4\n@RG\tID:filename\tSM:filename' | samtools view -F4 -S -b - > empty_dummy.bam"
        printtime("DEBUG: Calling '%s':" % cmd)
        ret = subprocess.call(cmd, shell=True)
        if ret != 0:
            printtime(
                "ERROR: empty bam file generation failed, return code: %d" % ret
            )
            raise RuntimeError("exit code: %d" % ret)

    # Choose the amount of parallelism from the total memory.  All memory
    # comparisons are guarded together so an uncomparable value (e.g. None)
    # falls back to the conservative defaults instead of failing later.
    parallel_datasets = 1
    low_memory = False
    memTotalGb = _get_total_memory_gb()
    try:
        if memTotalGb > 140:
            parallel_datasets = 4
        elif memTotalGb >= 70:
            parallel_datasets = 2
        low_memory = memTotalGb <= 40
    except Exception:
        printtime(
            "WARNING: unable to evaluate total memory '%s', using defaults"
            % (memTotalGb,)
        )

    # Floor division keeps the thread count an integer on Python 3 as well.
    align_threads = multiprocessing.cpu_count() // parallel_datasets
    if low_memory:
        # reduce number of CPU (1 vCPU = 2 cores)
        align_threads -= 2
    # Never hand a zero or negative thread count to the aligner.
    align_threads = max(1, align_threads)

    printtime("Attempt to align")
    printtime(
        "DEBUG: PROCESS DATASETS blocks: '%s', parallel datasets: %d"
        % (blocks, parallel_datasets)
    )

    # TODO: compare with pipeline/python/ion/utils/ionstats.py
    ionstats_basecaller_file_list = []
    ionstats_alignment_file_list = []
    align_dataset_args = []

    read_group_info = basecaller_datasets["read_groups"]
    for dataset in basecaller_datasets["datasets"]:
        reference = read_group_info[dataset["read_groups"][0]]["reference"]

        # A dataset counts as filtered only if every read group is filtered.
        filtered = all(
            read_group_info[rg_name].get("filtered", False)
            for rg_name in dataset["read_groups"]
        )

        # skip non-existing bam file
        if int(dataset["read_count"]) == 0:
            continue

        align_dataset_args.append(
            (
                dataset,
                blocks,
                reference,
                alignmentArgs,
                ionstatsArgs,
                BASECALLER_RESULTS,
                basecaller_meta_information,
                library_key,
                graph_max_x,
                ALIGNMENT_RESULTS,
                do_realign,
                do_ionstats,
                do_mark_duplicates,
                do_indexing,
                align_threads,
                barcodeInfo,
            )
        )

        # Filtered datasets are aligned but left out of the merged statistics.
        if filtered:
            continue
        if reference:
            ionstats_alignment_file_list.append(
                os.path.join(
                    ALIGNMENT_RESULTS,
                    dataset["file_prefix"] + ".ionstats_alignment.json",
                )
            )
        else:
            ionstats_basecaller_file_list.append(
                os.path.join(
                    BASECALLER_RESULTS,
                    dataset["file_prefix"] + ".ionstats_basecaller.json",
                )
            )

    # do alignment in multiprocessing pool; always release the workers
    pool = multiprocessing.Pool(processes=parallel_datasets)
    try:
        pool.map(align_dataset_parallel_wrap, align_dataset_args)
    finally:
        pool.close()
        pool.join()

    if do_ionstats:
        alignment_json = os.path.join(ALIGNMENT_RESULTS, "ionstats_alignment.json")
        basecaller_tmp_json = os.path.join(
            BASECALLER_RESULTS, "ionstats_tmp_basecaller.json"
        )

        # Merge ionstats files from individual (barcoded) datasets
        if ionstats_alignment_file_list:
            ionstats.reduce_stats(ionstats_alignment_file_list, alignment_json)
        else:
            # barcode classification filtered all barcodes or no reads available
            _write_empty_dummy_bam()
            ionstats.generate_ionstats_alignment(
                ionstatsArgs,
                ["empty_dummy.bam"],
                alignment_json,
                os.path.join(ALIGNMENT_RESULTS, "ionstats_error_summary.h5"),
                basecaller_meta_information,
                library_key,
                graph_max_x,
            )

        if ionstats_basecaller_file_list:
            ionstats.reduce_stats(ionstats_basecaller_file_list, basecaller_tmp_json)
        else:
            # barcode classification filtered all barcodes or no reads available
            _write_empty_dummy_bam()
            ionstats.generate_ionstats_basecaller(
                ["empty_dummy.bam"],
                basecaller_tmp_json,
                library_key,
                graph_max_x,
            )

        ionstatslist = [
            path
            for path in (alignment_json, basecaller_tmp_json)
            if os.path.exists(path)
        ]
        if ionstatslist:
            ionstats.reduce_stats(
                ionstatslist,
                os.path.join(
                    BASECALLER_RESULTS, "ionstats_basecaller_with_aligninfos.json"
                ),
            )
            # Same inputs in the opposite order for the second summary file.
            ionstats.reduce_stats(
                reversed(ionstatslist),
                os.path.join(BASECALLER_RESULTS, "ionstats_basecaller.json"),
            )

    printtime("**** Alignment completed ****")
Пример #2
0
def process_datasets(
        blocks,
        alignmentArgs,
        ionstatsArgs,
        BASECALLER_RESULTS,
        basecaller_meta_information,
        library_key,
        graph_max_x,
        basecaller_datasets,
        ALIGNMENT_RESULTS,
        do_realign,
        do_ionstats,
        do_mark_duplicates,
        do_indexing,
        barcodeInfo):
    """Align all non-empty datasets in a worker pool, then merge ionstats.

    One argument tuple is built per dataset whose ``read_count`` is non-zero
    and handed to ``align_dataset_parallel_wrap`` through a
    ``multiprocessing.Pool``.  When ``do_ionstats`` is true the per-dataset
    ionstats JSON files of the non-filtered datasets are reduced into the
    summary files under ``ALIGNMENT_RESULTS`` and ``BASECALLER_RESULTS``.

    Args:
        blocks: Logged here and forwarded unchanged to each worker.
        alignmentArgs: Forwarded unchanged to each worker.
        ionstatsArgs: Forwarded to each worker and to
            ``ionstats.generate_ionstats_alignment`` in the fallback path.
        BASECALLER_RESULTS: Directory used for basecaller ionstats files.
        basecaller_meta_information: Forwarded to the workers / ionstats.
        library_key: Forwarded to the workers / ionstats.
        graph_max_x: Forwarded to the workers / ionstats.
        basecaller_datasets: Mapping with a ``"datasets"`` list and a
            ``"read_groups"`` mapping.  Each dataset provides
            ``'read_groups'``, ``"read_count"`` and ``'file_prefix'``.
        ALIGNMENT_RESULTS: Directory used for alignment ionstats files.
        do_realign: Forwarded unchanged to each worker.
        do_ionstats: Forwarded to each worker; also enables the merge step.
        do_mark_duplicates: Forwarded unchanged to each worker.
        do_indexing: Forwarded unchanged to each worker.
        barcodeInfo: Accepted for interface compatibility; not used here.

    Raises:
        RuntimeError: If the shell command creating ``empty_dummy.bam``
            exits with a non-zero status.
    """

    def _write_empty_dummy_bam():
        # Creates a header-only BAM in the current directory so ionstats can
        # still produce an initial JSON file when there is nothing to merge.
        # TODO: ionstats needs to produce initial json file
        cmd = "echo  '@HD\tVN:1.5\tSO:coordinate\n@SQ\tSN:ref\tLN:4\n@RG\tID:filename\tSM:filename' | samtools view -F4 -S -b - > empty_dummy.bam"
        printtime("DEBUG: Calling '%s':" % cmd)
        ret = subprocess.call(cmd, shell=True)
        if ret != 0:
            printtime("ERROR: empty bam file generation failed, return code: %d" % ret)
            raise RuntimeError('exit code: %d' % ret)

    # Run two datasets at a time only when more than 70 GB of RAM is present.
    # The probe is best effort: if the page information cannot be read the
    # single-dataset default is kept.
    parallel_datasets = 1
    try:
        memTotalGb = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES') / (1024 * 1024 * 1024)
        if memTotalGb > 70:
            parallel_datasets = 2
    except Exception:
        printtime("WARNING: unable to determine total memory, using 1 parallel dataset")

    # Floor division keeps the thread count an integer on Python 3 as well;
    # the clamp guards against a zero result.
    align_threads = max(1, multiprocessing.cpu_count() // parallel_datasets)
    printtime("Attempt to align")
    printtime("DEBUG: PROCESS DATASETS blocks: '%s', parallel datasets: %d" % (blocks, parallel_datasets))

    # TODO: compare with pipeline/python/ion/utils/ionstats.py
    ionstats_basecaller_file_list = []
    ionstats_alignment_file_list = []
    align_dataset_args = []

    read_group_info = basecaller_datasets["read_groups"]
    for dataset in basecaller_datasets["datasets"]:
        reference = read_group_info[dataset['read_groups'][0]]['reference']

        # A dataset counts as filtered only if every read group is filtered.
        filtered = all(
            read_group_info[rg_name].get('filtered', False)
            for rg_name in dataset["read_groups"])

        # skip non-existing bam file
        if int(dataset["read_count"]) == 0:
            continue

        align_dataset_args.append((
            dataset,
            blocks,
            reference,
            alignmentArgs,
            ionstatsArgs,
            BASECALLER_RESULTS,
            basecaller_meta_information,
            library_key,
            graph_max_x,
            ALIGNMENT_RESULTS,
            do_realign,
            do_ionstats,
            do_mark_duplicates,
            do_indexing,
            align_threads
        ))

        # Filtered datasets are aligned but left out of the merged statistics.
        if filtered:
            continue
        if reference:
            ionstats_alignment_file_list.append(
                os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.ionstats_alignment.json'))
        else:
            ionstats_basecaller_file_list.append(
                os.path.join(BASECALLER_RESULTS, dataset['file_prefix'] + '.ionstats_basecaller.json'))

    # do alignment in multiprocessing pool; always release the workers
    pool = multiprocessing.Pool(processes=parallel_datasets)
    try:
        pool.map(align_dataset_parallel_wrap, align_dataset_args)
    finally:
        pool.close()
        pool.join()

    if do_ionstats:
        alignment_json = os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json')
        basecaller_tmp_json = os.path.join(BASECALLER_RESULTS, 'ionstats_tmp_basecaller.json')

        # Merge ionstats files from individual (barcoded) datasets
        if ionstats_alignment_file_list:
            ionstats.reduce_stats(ionstats_alignment_file_list, alignment_json)
        else:
            # barcode classification filtered all barcodes or no reads available
            _write_empty_dummy_bam()
            ionstats.generate_ionstats_alignment(
                ionstatsArgs,
                ['empty_dummy.bam'],
                alignment_json,
                os.path.join(ALIGNMENT_RESULTS, 'ionstats_error_summary.h5'),
                basecaller_meta_information,
                library_key,
                graph_max_x)

        if ionstats_basecaller_file_list:
            ionstats.reduce_stats(ionstats_basecaller_file_list, basecaller_tmp_json)
        else:
            # barcode classification filtered all barcodes or no reads available
            _write_empty_dummy_bam()
            ionstats.generate_ionstats_basecaller(
                ['empty_dummy.bam'],
                basecaller_tmp_json,
                library_key,
                graph_max_x)

        ionstatslist = [
            path for path in (alignment_json, basecaller_tmp_json)
            if os.path.exists(path)]
        if ionstatslist:
            ionstats.reduce_stats(
                ionstatslist,
                os.path.join(BASECALLER_RESULTS, 'ionstats_basecaller_with_aligninfos.json'))
            # Same inputs in the opposite order for the second summary file.
            ionstats.reduce_stats(
                reversed(ionstatslist),
                os.path.join(BASECALLER_RESULTS, 'ionstats_basecaller.json'))

    printtime("**** Alignment completed ****")
Пример #3
0
def process_datasets(blocks, alignmentArgs, ionstatsArgs, BASECALLER_RESULTS,
                     basecaller_meta_information, library_key, graph_max_x,
                     basecaller_datasets, ALIGNMENT_RESULTS, do_realign,
                     do_ionstats, do_mark_duplicates, do_indexing,
                     barcodeInfo):
    """Align every non-empty dataset sequentially, then merge ionstats.

    Datasets with a reference, more than one block and more than 20,000,000
    reads are aligned block by block and the per-block BAM files are merged
    through a ``samtools merge`` shell pipeline.  All other datasets are
    aligned with a single ``align`` call over all blocks.  A failure while
    processing one dataset is logged with a traceback and the loop moves on
    to the next dataset.  When ``do_ionstats`` is true the per-dataset
    ionstats JSON files of the non-filtered datasets are reduced into the
    summary files under ``ALIGNMENT_RESULTS`` and ``BASECALLER_RESULTS``.

    Args:
        blocks: Block directories; logged and passed to ``align``.
        alignmentArgs: Forwarded to ``align``.
        ionstatsArgs: Forwarded to ``align`` and to the ionstats helpers.
        BASECALLER_RESULTS: Directory holding the basecaller BAM files and
            the basecaller ionstats files.
        basecaller_meta_information: Forwarded to ``align`` / ionstats.
        library_key: Forwarded to ``align`` / ionstats.
        graph_max_x: Forwarded to ``align`` / ionstats.
        basecaller_datasets: Mapping with a ``"datasets"`` list and a
            ``"read_groups"`` mapping.  Each dataset provides
            ``'read_groups'``, ``"read_count"``, ``'file_prefix'`` and
            ``'basecaller_bam'``.
        ALIGNMENT_RESULTS: Directory used for alignment output.
        do_realign: Forwarded to ``align``.
        do_ionstats: Enables ionstats generation and the merge step.
        do_mark_duplicates: Enables duplicate marking.
        do_indexing: Enables BAM indexing.
        barcodeInfo: Accepted for interface compatibility; not used here.

    Raises:
        RuntimeError: If the shell command creating ``empty_dummy.bam``
            exits with a non-zero status.
    """

    def _write_empty_dummy_bam():
        # Creates a header-only BAM in the current directory so ionstats can
        # still produce an initial JSON file when there is nothing to merge.
        # TODO: ionstats needs to produce initial json file
        cmd = "echo  '@HD\tVN:1.5\tSO:coordinate\n@SQ\tSN:ref\tLN:4\n@RG\tID:filename\tSM:filename' | samtools view -F4 -S -b - > empty_dummy.bam"
        printtime("DEBUG: Calling '%s':" % cmd)
        ret = subprocess.call(cmd, shell=True)
        if ret != 0:
            printtime(
                "ERROR: empty bam file generation failed, return code: %d" %
                ret)
            raise RuntimeError('exit code: %d' % ret)

    printtime("Attempt to align")
    printtime("DEBUG: PROCESS DATASETS blocks: '%s'" % blocks)

    do_sorting = True

    # TODO: compare with pipeline/python/ion/utils/ionstats.py
    ionstats_basecaller_file_list = []
    ionstats_alignment_file_list = []

    read_group_info = basecaller_datasets["read_groups"]
    for dataset in basecaller_datasets["datasets"]:
        reference = read_group_info[dataset['read_groups'][0]]['reference']

        # A dataset counts as filtered only if every read group is filtered.
        filtered = all(
            read_group_info[rg_name].get('filtered', False)
            for rg_name in dataset["read_groups"])

        # skip non-existing bam file
        if int(dataset["read_count"]) == 0:
            continue

        try:
            # process block by block
            if reference and len(blocks) > 1 and int(
                    dataset["read_count"]) > 20000000:
                printtime(
                    "DEBUG: TRADITIONAL BLOCK PROCESSING ------ prefix: %20s ----------- reference: %20s ---------- reads: %10s ----------"
                    % (dataset['file_prefix'], reference,
                       dataset["read_count"]))
                # start alignment for each block and current barcode with reads
                # TODO: in how many blocks are reads with this barcode
                for block in blocks:
                    printtime("DEBUG: ALIGN ONLY ONE BLOCK: %s" % block)
                    align([block],
                          os.path.join(BASECALLER_RESULTS,
                                       dataset['basecaller_bam']),
                          alignmentArgs,
                          ionstatsArgs,
                          reference,
                          basecaller_meta_information,
                          library_key,
                          graph_max_x,
                          do_realign,
                          do_ionstats=False,
                          do_sorting=do_sorting,
                          do_mark_duplicates=False,
                          do_indexing=False,
                          output_dir=os.path.join(block, ALIGNMENT_RESULTS),
                          output_basename=dataset['file_prefix'])

                prefix = dataset['file_prefix']
                merged_bam = prefix + ".bam"

                # NOTE(review): block BAMs are looked up in '<block>/./' while
                # align() above writes to '<block>/<ALIGNMENT_RESULTS>' --
                # confirm these refer to the same directory.
                bamdir = '.'  # TODO , do we need this ?
                block_bam_list = [
                    candidate
                    for candidate in (os.path.join(blockdir, bamdir, merged_bam)
                                      for blockdir in blocks)
                    if os.path.exists(candidate)
                ]
                printtime("blocks with reads:    %s" % len(block_bam_list))

                blockprocessing.extract_and_merge_bam_header(
                    block_bam_list, merged_bam)

                # Usage: samtools merge [-nr] [-h inh.sam] <out.bam> <in1.bam> <in2.bam> [...]
                cmd = 'samtools merge -l1 -@8'
                if do_ionstats:
                    # merge to stdout so the stream can be teed into ionstats
                    cmd += ' - '
                else:
                    cmd += ' %s' % merged_bam
                for block_bam in block_bam_list:
                    cmd += ' %s' % block_bam
                cmd += ' -h %s.header.sam' % merged_bam

                if do_ionstats:
                    ionstats_cmd = ionstats.generate_ionstats_alignment_cmd(
                        ionstatsArgs, ["/dev/stdin"],
                        "%s.ionstats_alignment.json" % prefix,
                        "%s.ionstats_error_summary.h5" % prefix,
                        basecaller_meta_information, library_key, graph_max_x)
                    cmd += " | tee >(%s)" % ionstats_cmd

                if do_mark_duplicates:
                    if prefix != 'rawlib':
                        json_name = 'BamDuplicates.%s.json' % prefix
                    else:
                        json_name = 'BamDuplicates.json'
                    cmd = "BamDuplicates -i <(%s) -o %s -j %s" % (
                        cmd, merged_bam, json_name)
                else:
                    cmd += " > %s.bam" % prefix

                # bash is required for the process substitutions used above
                printtime("DEBUG: Calling '%s':" % cmd)
                ret = subprocess.Popen(['/bin/bash', '-c', cmd]).wait()
                if ret != 0:
                    printtime("ERROR: merging failed, return code: %d" % ret)
                    raise RuntimeError('exit code: %d' % ret)

                # TODO: piping into samtools index or create index in sort process ?
                if do_indexing and do_sorting:
                    cmd = "samtools index " + merged_bam
                    printtime("DEBUG: Calling '%s':" % cmd)
                    ret = subprocess.call(cmd, shell=True)
                    if ret != 0:
                        # indexing stays best effort, but is no longer silent
                        printtime(
                            "ERROR: indexing failed, return code: %d" % ret)

            else:
                printtime(
                    "DEBUG: MERGED BLOCK PROCESSING ----------- prefix: %20s ----------- reference: %20s ---------- reads: %10s ----------"
                    % (dataset['file_prefix'], reference,
                       dataset["read_count"]))
                # TODO: try a python multiprocessing pool
                align(blocks,
                      os.path.join(BASECALLER_RESULTS,
                                   dataset['basecaller_bam']),
                      alignmentArgs,
                      ionstatsArgs,
                      reference,
                      basecaller_meta_information,
                      library_key,
                      graph_max_x,
                      do_realign,
                      do_ionstats,
                      do_sorting,
                      do_mark_duplicates,
                      do_indexing,
                      output_dir=ALIGNMENT_RESULTS
                      if reference else BASECALLER_RESULTS,
                      output_basename=dataset['file_prefix'])
        except Exception:
            # Keep going with the remaining datasets, but let
            # KeyboardInterrupt / SystemExit stop the run.
            traceback.print_exc()

        # Filtered datasets are processed but left out of the merged statistics.
        if filtered:
            continue
        if reference:
            ionstats_alignment_file_list.append(
                os.path.join(
                    ALIGNMENT_RESULTS,
                    dataset['file_prefix'] + '.ionstats_alignment.json'))
        else:
            ionstats_basecaller_file_list.append(
                os.path.join(
                    BASECALLER_RESULTS,
                    dataset['file_prefix'] + '.ionstats_basecaller.json'))

    if do_ionstats:
        alignment_json = os.path.join(ALIGNMENT_RESULTS,
                                      'ionstats_alignment.json')
        basecaller_tmp_json = os.path.join(BASECALLER_RESULTS,
                                           'ionstats_tmp_basecaller.json')

        # Merge ionstats files from individual (barcoded) datasets
        if ionstats_alignment_file_list:
            ionstats.reduce_stats(ionstats_alignment_file_list, alignment_json)
        else:
            # barcode classification filtered all barcodes or no reads available
            _write_empty_dummy_bam()
            ionstats.generate_ionstats_alignment(
                ionstatsArgs, ['empty_dummy.bam'], alignment_json,
                os.path.join(ALIGNMENT_RESULTS, 'ionstats_error_summary.h5'),
                basecaller_meta_information, library_key, graph_max_x)

        if ionstats_basecaller_file_list:
            ionstats.reduce_stats(ionstats_basecaller_file_list,
                                  basecaller_tmp_json)
        else:
            # barcode classification filtered all barcodes or no reads available
            _write_empty_dummy_bam()
            ionstats.generate_ionstats_basecaller(
                ['empty_dummy.bam'], basecaller_tmp_json, library_key,
                graph_max_x)

        ionstatslist = [
            path for path in (alignment_json, basecaller_tmp_json)
            if os.path.exists(path)
        ]
        if ionstatslist:
            ionstats.reduce_stats(
                ionstatslist,
                os.path.join(BASECALLER_RESULTS,
                             'ionstats_basecaller_with_aligninfos.json'))
            # Same inputs in the opposite order for the second summary file.
            ionstats.reduce_stats(
                reversed(ionstatslist),
                os.path.join(BASECALLER_RESULTS, 'ionstats_basecaller.json'))

    printtime("**** Alignment completed ****")
Пример #4
0
def process_datasets(
        blocks,
        alignmentArgs,
        ionstatsArgs,
        BASECALLER_RESULTS,
        basecaller_meta_information,
        library_key,
        graph_max_x,
        basecaller_datasets,
        ALIGNMENT_RESULTS,
        do_realign,
        do_ionstats,
        do_mark_duplicates,
        do_indexing,
        barcodeInfo):
    """Merge per-block BAMs, align each non-empty dataset, merge ionstats.

    For every dataset whose ``read_count`` is non-zero the per-block
    unmapped BAM files are merged first.  Datasets with a reference are then
    passed to ``align``; datasets without one only get basecaller ionstats
    (when ``do_ionstats`` is true).  Merge and alignment failures are logged
    with a traceback and processing continues.  When ``do_ionstats`` is true
    the per-dataset ionstats JSON files of the non-filtered datasets are
    reduced into the summary files under ``ALIGNMENT_RESULTS`` and
    ``BASECALLER_RESULTS``.

    Args:
        blocks: Block directories searched for per-block BAM files; also
            passed to ``align``.
        alignmentArgs: Forwarded to ``align``.
        ionstatsArgs: Forwarded to ``align`` and to the ionstats helpers.
        BASECALLER_RESULTS: Directory holding the basecaller BAM files and
            the basecaller ionstats files.
        basecaller_meta_information: Forwarded to ``align`` / ionstats.
        library_key: Forwarded to ``align`` / ionstats.
        graph_max_x: Forwarded to ``align`` / ionstats.
        basecaller_datasets: Mapping with a ``"datasets"`` list and a
            ``"read_groups"`` mapping.  Each dataset provides
            ``'read_groups'``, ``"read_count"``, ``'file_prefix'`` and
            ``'basecaller_bam'``.
        ALIGNMENT_RESULTS: Directory used for alignment output.
        do_realign: Forwarded to ``align``.
        do_ionstats: Forwarded to ``align``; also enables basecaller
            ionstats generation and the merge step.
        do_mark_duplicates: Forwarded to ``align``.
        do_indexing: Forwarded to ``align``.
        barcodeInfo: Accepted for interface compatibility; not used here.

    Raises:
        RuntimeError: If the shell command creating ``empty_dummy.bam``
            exits with a non-zero status.
    """

    def _merge_unmapped_block_bams(dataset):
        # Best-effort merge of the per-block basecaller BAMs of one dataset
        # into BASECALLER_RESULTS; failures are logged, never raised.
        bamfile = None  # keeps the error message safe if the lookup fails
        try:
            bamfile = dataset['basecaller_bam']
            block_bam_list = [
                candidate
                for candidate in (os.path.join(blockdir, BASECALLER_RESULTS, bamfile)
                                  for blockdir in blocks)
                if os.path.exists(candidate)]
            if block_bam_list:
                composite_bam_filepath = os.path.join(BASECALLER_RESULTS, bamfile)
                composite_bai_filepath = ""
                mark_duplicates = False
                method = 'samtools'
                blockprocessing.merge_bam_files(
                    block_bam_list, composite_bam_filepath,
                    composite_bai_filepath, mark_duplicates, method)
        except Exception:
            traceback.print_exc()
            printtime("ERROR: merging %s unsuccessful" % bamfile)

    def _write_empty_dummy_bam():
        # Creates a header-only BAM in the current directory so ionstats can
        # still produce an initial JSON file when there is nothing to merge.
        # TODO: ionstats needs to produce initial json file
        cmd = "echo  '@HD\tVN:1.5\tSO:coordinate\n@SQ\tSN:ref\tLN:4\n@RG\tID:filename\tSM:filename' | samtools view -F4 -S -b - > empty_dummy.bam"
        printtime("DEBUG: Calling '%s':" % cmd)
        ret = subprocess.call(cmd, shell=True)
        if ret != 0:
            printtime("ERROR: empty bam file generation failed, return code: %d" % ret)
            raise RuntimeError('exit code: %d' % ret)

    printtime("Attempt to align")

    do_sorting = True

    # compare with pipeline/python/ion/utils/ionstats.py
    ionstats_basecaller_file_list = []
    ionstats_alignment_file_list = []

    read_group_info = basecaller_datasets["read_groups"]
    for dataset in basecaller_datasets["datasets"]:
        reference = read_group_info[dataset['read_groups'][0]]['reference']

        # A dataset counts as filtered only if every read group is filtered.
        filtered = all(
            read_group_info[rg_name].get('filtered', False)
            for rg_name in dataset["read_groups"])

        # skip non-existing bam file
        if int(dataset["read_count"]) == 0:
            continue

        # merge unmapped bam files TODO move into align
        _merge_unmapped_block_bams(dataset)

        if reference:
            try:
                align(
                    blocks,
                    alignmentArgs,
                    ionstatsArgs,
                    reference,
                    basecaller_meta_information,
                    library_key,
                    graph_max_x,
                    os.path.join(BASECALLER_RESULTS, dataset['basecaller_bam']),
                    do_realign,
                    do_ionstats,
                    do_sorting,
                    do_mark_duplicates,
                    do_indexing,
                    logfile=os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.alignmentQC_out.txt'),
                    output_dir=ALIGNMENT_RESULTS,
                    output_basename=dataset['file_prefix'])
            except Exception:
                # Keep going with the remaining datasets, but let
                # KeyboardInterrupt / SystemExit stop the run.
                traceback.print_exc()

            # Filtered datasets are left out of the merged statistics.
            if not filtered:
                ionstats_alignment_file_list.append(
                    os.path.join(ALIGNMENT_RESULTS, dataset['file_prefix'] + '.ionstats_alignment.json'))

            # NOTE: a separate blockprocessing.create_index_file step used to
            # follow here but was disabled; do_indexing is passed to align().

        elif do_ionstats:
            # TODO: move ionstats basecaller into basecaller
            basecaller_json = os.path.join(
                BASECALLER_RESULTS, dataset['file_prefix'] + '.ionstats_basecaller.json')
            ionstats.generate_ionstats_basecaller(
                [os.path.join(BASECALLER_RESULTS, dataset['basecaller_bam'])],
                basecaller_json,
                library_key,
                graph_max_x)

            # Filtered datasets are left out of the merged statistics.
            if not filtered:
                ionstats_basecaller_file_list.append(basecaller_json)

    if do_ionstats:
        alignment_json = os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json')
        basecaller_tmp_json = os.path.join(BASECALLER_RESULTS, 'ionstats_tmp_basecaller.json')

        # Merge ionstats files from individual (barcoded) datasets
        if ionstats_alignment_file_list:
            ionstats.reduce_stats(ionstats_alignment_file_list, alignment_json)
        else:
            # barcode classification filtered all barcodes or no reads available
            _write_empty_dummy_bam()
            ionstats.generate_ionstats_alignment(
                ionstatsArgs,
                ['empty_dummy.bam'],
                alignment_json,
                os.path.join(ALIGNMENT_RESULTS, 'ionstats_error_summary.h5'),
                basecaller_meta_information,
                library_key,
                graph_max_x)

        if ionstats_basecaller_file_list:
            ionstats.reduce_stats(ionstats_basecaller_file_list, basecaller_tmp_json)
        else:
            # barcode classification filtered all barcodes or no reads available
            _write_empty_dummy_bam()
            ionstats.generate_ionstats_basecaller(
                ['empty_dummy.bam'],
                basecaller_tmp_json,
                library_key,
                graph_max_x)

        ionstatslist = [
            path for path in (alignment_json, basecaller_tmp_json)
            if os.path.exists(path)]
        if ionstatslist:
            ionstats.reduce_stats(
                ionstatslist,
                os.path.join(BASECALLER_RESULTS, 'ionstats_basecaller_with_aligninfos.json'))
            # Same inputs in the opposite order for the second summary file.
            ionstats.reduce_stats(
                reversed(ionstatslist),
                os.path.join(BASECALLER_RESULTS, 'ionstats_basecaller.json'))

    printtime("**** Alignment completed ****")
Пример #5
0
def process_datasets(
        blocks,
        alignmentArgs,
        ionstatsArgs,
        BASECALLER_RESULTS,
        basecaller_meta_information,
        library_key,
        graph_max_x,
        basecaller_datasets,
        ALIGNMENT_RESULTS,
        do_realign,
        do_ionstats,
        do_mark_duplicates,
        do_indexing,
        barcodeInfo):
    """Align every basecaller dataset and aggregate the resulting ionstats files.

    Datasets with a read count of zero are skipped.  A dataset is aligned
    block by block (and the per-block BAM files merged afterwards) when it has
    a reference, more than one block is given and it holds more than
    20,000,000 reads; otherwise all blocks are handed to a single ``align``
    call.  An exception raised while processing one dataset is printed and
    processing continues with the next dataset.

    When ``do_ionstats`` is true the per-dataset ionstats JSON files of all
    datasets that are not completely filtered are reduced into
    ``ionstats_alignment.json`` (in ALIGNMENT_RESULTS) and
    ``ionstats_tmp_basecaller.json``, ``ionstats_basecaller_with_aligninfos.json``
    and ``ionstats_basecaller.json`` (in BASECALLER_RESULTS).  If there is
    nothing to reduce, the statistics are generated from an empty dummy BAM.

    Args:
        blocks: sequence of block directories.
        alignmentArgs: passed through to ``align``.
        ionstatsArgs: passed through to ``align`` and the ionstats helpers.
        BASECALLER_RESULTS: directory holding the basecaller BAM files and
            receiving the basecaller ionstats files.
        basecaller_meta_information: passed through to ``align`` / ionstats.
        library_key: passed through to ``align`` / ionstats.
        graph_max_x: passed through to ``align`` / ionstats.
        basecaller_datasets: dict with a ``"datasets"`` list and a
            ``"read_groups"`` dict keyed by read group name.
        ALIGNMENT_RESULTS: directory receiving the alignment results.
        do_realign: passed through to ``align``.
        do_ionstats: whether ionstats files are produced and merged.
        do_mark_duplicates: whether duplicates are marked in the final BAM.
        do_indexing: whether the final BAM is indexed.
        barcodeInfo: not used by this function; kept for interface
            compatibility.

    Raises:
        RuntimeError: if the empty dummy BAM cannot be generated.
    """
    printtime("Attempt to align")
    printtime("DEBUG: PROCESS DATASETS blocks: '%s'" % blocks)

    do_sorting = True

    # TODO: compare with pipeline/python/ion/utils/ionstats.py
    ionstats_basecaller_file_list = []
    ionstats_alignment_file_list = []

    for dataset in basecaller_datasets["datasets"]:

        read_group = dataset['read_groups'][0]
        reference = basecaller_datasets['read_groups'][read_group]['reference']

        # A dataset counts as filtered only if all of its read groups are.
        filtered = all(
            basecaller_datasets["read_groups"][rg_name].get('filtered', False)
            for rg_name in dataset["read_groups"])

        # skip non-existing bam file
        if int(dataset["read_count"]) == 0:
            continue

        file_prefix = dataset['file_prefix']
        basecaller_bam = os.path.join(BASECALLER_RESULTS, dataset['basecaller_bam'])

        try:
            if reference and len(blocks) > 1 and int(dataset["read_count"]) > 20000000:
                # process block by block
                printtime("DEBUG: TRADITIONAL BLOCK PROCESSING ------ prefix: %20s ----------- reference: %20s ---------- reads: %10s ----------" % (file_prefix, reference, dataset["read_count"]))
                # start alignment for each block and current barcode with reads
                # TODO: in how many blocks are reads with this barcode
                for block in blocks:
                    printtime("DEBUG: ALIGN ONLY ONE BLOCK: %s" % block)
                    align(
                        [block],
                        basecaller_bam,
                        alignmentArgs,
                        ionstatsArgs,
                        reference,
                        basecaller_meta_information,
                        library_key,
                        graph_max_x,
                        do_realign,
                        do_ionstats=False,
                        do_sorting=do_sorting,
                        do_mark_duplicates=False,
                        do_indexing=False,
                        output_dir=os.path.join(block, ALIGNMENT_RESULTS),
                        output_basename=file_prefix)

                _merge_block_bams(
                    blocks,
                    file_prefix,
                    ionstatsArgs,
                    basecaller_meta_information,
                    library_key,
                    graph_max_x,
                    do_ionstats,
                    do_mark_duplicates,
                    do_indexing and do_sorting)
            else:
                printtime("DEBUG: MERGED BLOCK PROCESSING ----------- prefix: %20s ----------- reference: %20s ---------- reads: %10s ----------" % (file_prefix, reference, dataset["read_count"]))
                # TODO: try a python multiprocessing pool
                align(
                    blocks,
                    basecaller_bam,
                    alignmentArgs,
                    ionstatsArgs,
                    reference,
                    basecaller_meta_information,
                    library_key,
                    graph_max_x,
                    do_realign,
                    do_ionstats,
                    do_sorting,
                    do_mark_duplicates,
                    do_indexing,
                    output_dir=ALIGNMENT_RESULTS if reference else BASECALLER_RESULTS,
                    output_basename=file_prefix)
        except Exception:
            # Best effort: a failing dataset must not stop the remaining ones.
            # KeyboardInterrupt / SystemExit are deliberately not caught.
            traceback.print_exc()

        # Completely filtered datasets do not contribute to the merged stats.
        if filtered:
            continue

        if reference:
            ionstats_alignment_file_list.append(
                os.path.join(ALIGNMENT_RESULTS, file_prefix + '.ionstats_alignment.json'))
        else:
            ionstats_basecaller_file_list.append(
                os.path.join(BASECALLER_RESULTS, file_prefix + '.ionstats_basecaller.json'))

    if do_ionstats:
        alignment_json = os.path.join(ALIGNMENT_RESULTS, 'ionstats_alignment.json')
        tmp_basecaller_json = os.path.join(BASECALLER_RESULTS, 'ionstats_tmp_basecaller.json')

        # Merge ionstats files from individual (barcoded) datasets
        if ionstats_alignment_file_list:
            ionstats.reduce_stats(ionstats_alignment_file_list, alignment_json)
        else:
            # barcode classification filtered all barcodes or no reads available
            # TODO: ionstats needs to produce initial json file
            ionstats.generate_ionstats_alignment(
                ionstatsArgs,
                [_create_empty_dummy_bam()],
                alignment_json,
                os.path.join(ALIGNMENT_RESULTS, 'ionstats_error_summary.h5'),
                basecaller_meta_information,
                library_key,
                graph_max_x)

        if ionstats_basecaller_file_list:
            ionstats.reduce_stats(ionstats_basecaller_file_list, tmp_basecaller_json)
        else:
            # barcode classification filtered all barcodes or no reads available
            # TODO: ionstats needs to produce initial json file
            ionstats.generate_ionstats_basecaller(
                [_create_empty_dummy_bam()],
                tmp_basecaller_json,
                library_key,
                graph_max_x)

        ionstatslist = [
            stats_file for stats_file in (alignment_json, tmp_basecaller_json)
            if os.path.exists(stats_file)]
        if ionstatslist:
            ionstats.reduce_stats(ionstatslist, os.path.join(BASECALLER_RESULTS, 'ionstats_basecaller_with_aligninfos.json'))
            ionstats.reduce_stats(reversed(ionstatslist), os.path.join(BASECALLER_RESULTS, 'ionstats_basecaller.json'))
        # TODO: the per-dataset ionstats_error_summary.h5 files are not merged here.

    printtime("**** Alignment completed ****")


def _merge_block_bams(
        blocks,
        file_prefix,
        ionstatsArgs,
        basecaller_meta_information,
        library_key,
        graph_max_x,
        do_ionstats,
        do_mark_duplicates,
        do_indexing):
    """Merge the per-block BAM files of one dataset into ``<file_prefix>.bam``.

    The merged stream is optionally teed into ionstats (``do_ionstats``) and
    optionally routed through BamDuplicates (``do_mark_duplicates``).  When
    ``do_indexing`` is true the resulting BAM is indexed with samtools.

    Raises:
        RuntimeError: if the merge pipeline exits with a non-zero code.
    """
    bamdir = '.'  # TODO , do we need this ?
    composite_bam_filepath = file_prefix + ".bam"

    # Only blocks that actually produced a BAM for this dataset take part.
    block_bam_list = [
        os.path.join(blockdir, bamdir, composite_bam_filepath) for blockdir in blocks]
    block_bam_list = [
        block_bam for block_bam in block_bam_list if os.path.exists(block_bam)]
    printtime("blocks with reads:    %s" % len(block_bam_list))

    blockprocessing.extract_and_merge_bam_header(block_bam_list, composite_bam_filepath)

    # Usage: samtools merge [-nr] [-h inh.sam] <out.bam> <in1.bam> <in2.bam> [...]
    # Always merge to stdout: every consumer below (tee into ionstats,
    # BamDuplicates via process substitution, plain redirect) reads the merged
    # stream.  Writing to the composite file directly would collide with the
    # shell redirect to the same file (samtools refuses to overwrite an
    # existing output without -f) and would leave BamDuplicates without input.
    cmd = 'samtools merge -l1 -@8 - '
    cmd += ''.join(' %s' % block_bam for block_bam in block_bam_list)
    cmd += ' -h %s.header.sam' % composite_bam_filepath

    if do_ionstats:
        ionstats_cmd = ionstats.generate_ionstats_alignment_cmd(
            ionstatsArgs,
            ["/dev/stdin"],
            "%s.ionstats_alignment.json" % file_prefix,
            "%s.ionstats_error_summary.h5" % file_prefix,
            basecaller_meta_information,
            library_key,
            graph_max_x)
        cmd += " | tee >(%s)" % ionstats_cmd

    if do_mark_duplicates:
        json_name = 'BamDuplicates.%s.json' % file_prefix if file_prefix != 'rawlib' else 'BamDuplicates.json'
        cmd = "BamDuplicates -i <(%s) -o %s -j %s" % (cmd, composite_bam_filepath, json_name)
    else:
        cmd += " > %s.bam" % file_prefix

    printtime("DEBUG: Calling '%s':" % cmd)
    # bash is required for the process substitutions used above.
    ret = subprocess.Popen(['/bin/bash', '-c', cmd]).wait()
    if ret != 0:
        printtime("ERROR: merging failed, return code: %d" % ret)
        raise RuntimeError('exit code: %d' % ret)

    # TODO: piping into samtools index or create index in sort process ?
    if do_indexing:
        cmd = "samtools index " + composite_bam_filepath
        printtime("DEBUG: Calling '%s':" % cmd)
        ret = subprocess.call(cmd, shell=True)
        if ret != 0:
            # A missing index is reported but does not fail the dataset.
            printtime("ERROR: indexing failed, return code: %d" % ret)


def _create_empty_dummy_bam():
    """Write a header-only BAM named ``empty_dummy.bam`` and return its name.

    The file is created in the current working directory.

    Raises:
        RuntimeError: if the shell pipeline exits with a non-zero code.
    """
    cmd  = "echo  '@HD\tVN:1.5\tSO:coordinate\n@SQ\tSN:ref\tLN:4\n@RG\tID:filename\tSM:filename' | samtools view -F4 -S -b - > empty_dummy.bam"

    printtime("DEBUG: Calling '%s':" % cmd)
    ret = subprocess.call(cmd, shell=True)
    if ret != 0:
        printtime("ERROR: empty bam file generation failed, return code: %d" % ret)
        raise RuntimeError('exit code: %d' % ret)
    return 'empty_dummy.bam'