def adam_convert(master_ip, inputs, in_file, in_snps, adam_file, adam_snps, spark_on_toil):
    """
    Convert the input reads and the known-sites variants into ADAM format.

    Two ADAM commands are run through call_adam, both with the Spark memory
    setting taken from inputs['memory']:
        - "transform" from in_file to adam_file
        - "vcf2adam -only_variants" from in_snps to adam_snps

    After each conversion, remove_file is called with the base name of the
    source path (the text after its last "/").
    """

    # Reads: in_file -> adam_file.
    log.info("Converting input BAM to ADAM.")
    reads_cmd = ["transform", in_file, adam_file]
    call_adam(master_ip, reads_cmd, memory=inputs['memory'])
    remove_file(master_ip, in_file.rsplit("/", 1)[-1], spark_on_toil)

    # Known sites: in_snps -> adam_snps.
    log.info("Converting known sites VCF to ADAM.")
    variants_cmd = ["vcf2adam", "-only_variants", in_snps, adam_snps]
    call_adam(master_ip, variants_cmd, memory=inputs['memory'])
    remove_file(master_ip, in_snps.rsplit("/", 1)[-1], spark_on_toil)
def adam_transform(master_ip, inputs, in_file, snp_file, hdfs_dir, out_file, spark_on_toil):
    """
    Preprocess in_file with the known SNPs in snp_file and return out_file.

    The work is a chain of four ADAM "transform" runs, each reading the
    previous stage's output:
        1. mark duplicates         -> <hdfs_dir>/mkdups.adam
        2. realign indels          -> <hdfs_dir>/ri.adam
        3. recalibrate base quals  -> <hdfs_dir>/bqsr.adam
        4. sort, save single file  -> out_file

    After each stage, remove_file is called for that stage's input.
    """

    def transform(source, target, *flags):
        # Every stage is the same ADAM command; only the paths and flags vary.
        call_adam(master_ip,
                  ["transform", source, target] + list(flags),
                  memory=inputs['memory'])

    def cleanup(pattern):
        remove_file(master_ip, pattern, spark_on_toil)

    # NOTE(review): the cleanup patterns below are not uniform -- the mkdups
    # pattern carries the hdfs_dir prefix while the in_file, ri and bqsr
    # patterns are bare names. remove_file is not visible here, so confirm
    # which form it expects. (The original code also carried a FIXME on the
    # in_file cleanup.)

    log.info("Marking duplicate reads.")
    dedup_path = hdfs_dir + "/mkdups.adam"
    transform(in_file, dedup_path,
              "-aligned_read_predicate",
              "-limit_projection",
              "-mark_duplicate_reads")
    cleanup(in_file.rsplit("/", 1)[-1] + "*")

    log.info("Realigning INDELs.")
    realigned_path = hdfs_dir + "/ri.adam"
    transform(dedup_path, realigned_path, "-realign_indels")
    cleanup(dedup_path + "*")

    log.info("Recalibrating base quality scores.")
    recalibrated_path = hdfs_dir + "/bqsr.adam"
    transform(realigned_path, recalibrated_path,
              "-recalibrate_base_qualities",
              "-known_snps", snp_file)
    cleanup("ri.adam*")

    log.info("Sorting reads and saving a single BAM file.")
    transform(recalibrated_path, out_file, "-sort_reads", "-single")
    cleanup("bqsr.adam*")

    return out_file
# Example #3
# 0
def download_count_upload(job,
                          master_ip,
                          input_file,
                          output_file,
                          kmer_length,
                          spark_conf,
                          memory,
                          sudo):
    """
    Runs k-mer counting.

    1. If the input file is located in S3, the file is copied into HDFS.
    2. If the input file name ends in .bam, .sam, .fq or .fastq, the file is
       converted into ADAM format (same path, extension replaced by .adam).
    3. The k-mers are counted and saved as text.
    4. If the output path does not start with the HDFS prefix, the counts are
       written to a temporary HDFS file and then copied to the output path.

    :param job: Toil job (not used in the body of this function)
    :param master_ip: IP of the Spark/HDFS master, or None. When None, paths
        are not prefixed with an hdfs:// URL and a warning is logged.
    :param input_file: URL/path to input file to count k-mers on
    :param output_file: URL/path to save k-mer counts at
    :param kmer_length: The length of k-mer substrings to count.
    :param spark_conf: Optional Spark configuration. If set, memory should
        not be set.
    :param memory: Amount of memory to provided to Spark workers. Must be set
        if spark_conf is not set.
    :param sudo: Whether or not to run Spark containers with sudo (not used
        in the body of this function).

    :type job: toil.Job
    :type input_file: string
    :type output_file: string
    :type kmer_length: int or string
    :type spark_conf: list of string or None
    :type memory: int or None
    :type sudo: boolean
    """

    if master_ip is not None:
        hdfs_dir = "hdfs://{0}:{1}/".format(master_ip, HDFS_MASTER_PORT)
    else:
        _log.warning('Master IP is not set. If default filesystem is not set, jobs may fail.')
        hdfs_dir = ""

    # if the file isn't already in hdfs, copy it in
    if input_file.startswith("s3://"):

        # append the s3 file name to our hdfs path
        hdfs_input_file = hdfs_dir + input_file.split("/")[-1]

        # run the download
        _log.info("Downloading input file %s to %s.", input_file, hdfs_input_file)
        call_conductor(master_ip, input_file, hdfs_input_file,
                       memory=memory, override_parameters=spark_conf)

    else:
        # The file is expected to already be in HDFS, so use the given path
        # as-is. (Previously the path was left as the bare HDFS root and
        # input_file was never used on this branch.)
        hdfs_input_file = input_file
        if not input_file.startswith("hdfs://"):
            _log.warning("If not in S3, input file (%s) expected to be in HDFS (%s).",
                         input_file, hdfs_dir)

    # where are we writing the output to? is it going to a location in hdfs or not?
    # NOTE(review): when master_ip is None, hdfs_dir is "" and every string
    # starts with "", so the upload step is never run -- confirm this is
    # the intended behavior for S3 outputs without a master IP.
    run_upload = True
    hdfs_output_file = hdfs_dir + "kmer_output.txt"
    if output_file.startswith(hdfs_dir):
        run_upload = False
        hdfs_output_file = output_file

    # do we need to convert to adam?
    if hdfs_input_file.endswith(('.bam', '.sam', '.fq', '.fastq')):

        hdfs_tmp_file = hdfs_input_file

        # change the file extension to adam; list.append() returns None, so
        # the new path is built by cutting at the last '.' instead
        hdfs_input_file = hdfs_tmp_file.rsplit('.', 1)[0] + '.adam'

        # convert the file
        _log.info('Converting %s into ADAM format at %s.', hdfs_tmp_file, hdfs_input_file)
        call_adam(master_ip,
                  ['transform',
                   hdfs_tmp_file, hdfs_input_file],
                  memory=memory, override_parameters=spark_conf)

    # run k-mer counting
    # kmer_length may be an int or a string, so it is logged with %s
    _log.info('Counting %s-mers in %s, and saving to %s.',
              kmer_length, hdfs_input_file, hdfs_output_file)
    call_adam(master_ip,
              ['count_kmers',
               hdfs_input_file, hdfs_output_file,
               str(kmer_length)],
              memory=memory, override_parameters=spark_conf)

    # do we need to upload the file back? if so, run upload
    if run_upload:
        _log.info("Uploading output file %s to %s.", hdfs_output_file, output_file)
        call_conductor(master_ip, hdfs_output_file, output_file,
                       memory=memory, override_parameters=spark_conf)