def upload_data(job, master_ip, inputs, hdfs_name, upload_name, spark_on_toil):
    """
    Upload file hdfsName from hdfs to s3
    """

    if mock_mode():
        # Mock mode runs against truncated data, so shrink the file before uploading.
        truncate_file(master_ip, hdfs_name, spark_on_toil)

    log.info("Uploading output BAM %s to %s.", hdfs_name, upload_name)
    call_conductor(job, master_ip, hdfs_name, upload_name, memory=inputs.memory)
    remove_file(master_ip, hdfs_name, spark_on_toil)
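# Illustrative only: a minimal sketch of how upload_data might be scheduled from a
# parent Toil job. addFollowOnJobFn is a real Toil API, but the wrapper function and
# the HDFS/S3 paths below are hypothetical placeholders, not taken from the source.
def schedule_upload(job, master_ip, inputs, spark_on_toil):
    job.addFollowOnJobFn(upload_data, master_ip, inputs,
                         'hdfs://%s:8020/sample.aligned.bam' % master_ip,  # hypothetical HDFS path
                         's3://example-bucket/sample.aligned.bam',         # hypothetical S3 target
                         spark_on_toil,
                         memory=inputs.memory)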
Example 2
def upload_data(job, master_ip, inputs, hdfs_name, upload_name, spark_on_toil):
    """
    Upload file hdfsName from hdfs to s3
    """

    if mock_mode():
        truncate_file(master_ip, hdfs_name, spark_on_toil)

    log.info("Uploading output BAM %s to %s.", hdfs_name, upload_name)
    call_conductor(job,
                   master_ip,
                   hdfs_name,
                   upload_name,
                   memory=inputs.memory)
    remove_file(master_ip, hdfs_name, spark_on_toil)
Example 3
def generate_config():
    if mock_mode():
        return generate_mock_config()

    return textwrap.dedent("""
        # ADAM/GATK Pipeline configuration file
        # This configuration file is formatted in YAML. Write the value after the colon, leaving at least one space.
        # Edit the values in this configuration file and then rerun the pipeline
        # Comments (beginning with #) do not need to be removed. Optional parameters may be left blank.
        ##############################################################################################################
        pipeline-to-run: both     #
        skip-alignment: False     #
        skip-preprocessing: False #
        sequence-dir: sequence    #
        autoscale-cluster: False  #
        s3-bucket:                # S3 Bucket URI
        cpu-count:                # Optional:
        program-unit: 12345       #
        platform: ILLUMINA        #
        ref:                      # Required: Reference fasta file
        amb:                      # Required: Reference fasta file (amb)
        ann:                      # Required: Reference fasta file (ann)
        bwt:                      # Required: Reference fasta file (bwt)
        pac:                      # Required: Reference fasta file (pac)
        sa:                       # Required: Reference fasta file (sa)
        fai:                      # Required: Reference fasta file (fai)
        alt:                      # Optional: Alternate file for reference build (alt). Necessary for alt aware alignment.
        phase:                    # Required: URL (1000G_phase1.indels.hg19.sites.fixed.vcf)
        mills:                    # Required: URL (Mills_and_1000G_gold_standard.indels.hg19.sites.vcf)
        dbsnp:                    # Required: URL (dbsnp_132_b37.leftAligned.vcf)
        hapmap:                   # Required: URL (hapmap_3.3.b37.vcf)
        omni:                     # Required: URL (1000G_omni.5.b37.vcf)
        trim-adapters: False      # Trim adapters.
        num-nodes: 9              # Number of nodes to use. Do not set if providing master_ip.
        master-ip:                # Optional: IP or hostname of host running for Spark master and HDFS namenode.
                                  # Should be provided instead of num-nodes if pointing at a static (external or
                                  # standalone) Spark cluster. The special value 'auto' indicates the master of
                                  # an externally autoscaled cgcloud spark cluster, i.e. one that is managed by
                                  # the uberscript.
        file-size: 100G           # Approximate input file size. Should be given as %d[TGMK], e.g.,
                                  # for a 100 gigabyte file, use file-size: '100G'
        ssec:                     # Optional: (string) Path to Key File for SSE-C Encryption
        dir-suffix:               # Optional: suffix to add to output directory names.
        memory:                   # Required: Amount of available memory on each worker node.
    """[1:])
Example 4
def generate_manifest():
    if mock_mode():
        return generate_mock_manifest()
    return textwrap.dedent("""
        #   Edit this manifest to include information pertaining to each sample to be run.
        #   There is a single column: UUID
        #
        #   UUID        This should be a unique identifier for the sample to be processed that corresponds to 
        #               the prefix of the filenames of the input fastq files.
        #   
        #   Example:
        #   If your input fastq file pairs were input_file_name_1.illumina_1.fastq.gz, input_file_name_1.illumina_2.fastq.gz and 
        #   input_file_name_2.illumina_1.fastq.gz, input_file_name_2.illumina_2.fastq.gz, the manifest would be:
        #
        #   input_file_name_1.illumina
        #   input_file_name_2.illumina   
        #
        #   Input fastq files MUST be named according to the filename_1.fastq.gz, filename_2.fastq.gz convention
        #
        #   Place your samples below, one per line.
        """[1:])
Example 5
def main():
    """
    This is a Toil pipeline used to perform alignment of fastqs.
    """
    # Define Parser object and add to Toil
    if mock_mode():
        usage_msg = 'You have the TOIL_SCRIPTS_MOCK_MODE environment variable set, so this pipeline ' \
                    'will run in mock mode. To disable mock mode, set TOIL_SCRIPTS_MOCK_MODE=0'
    else:
        usage_msg = None

    parser = argparse.ArgumentParser(usage=usage_msg)
    subparsers = parser.add_subparsers(dest='command')
    subparsers.add_parser('generate-config', help='Generates an editable config in the current working directory.')
    subparsers.add_parser('generate-manifest', help='Generates an editable manifest in the current working directory.')
    subparsers.add_parser('generate', help='Generates a config and manifest in the current working directory.')
    # Run subparser
    parser_run = subparsers.add_parser('run', help='Runs the ADAM/GATK pipeline')
    default_config = 'adam-gatk-mock.config' if mock_mode() else 'adam-gatk.config'
    default_manifest = 'adam-gatk-mock-manifest.csv' if mock_mode() else 'adam-gatk-manifest.csv'
    parser_run.add_argument('--config', default=default_config, type=str,
                            help='Path to the (filled in) config file, generated with "generate-config".')
    parser_run.add_argument('--manifest', default=default_manifest,
                            type=str, help='Path to the (filled in) manifest file, generated with "generate-manifest". '
                                           '\nDefault value: "%(default)s".')
    Job.Runner.addToilOptions(parser_run)
    args = parser.parse_args()

    cwd = os.getcwd()
    if args.command in ('generate-config', 'generate'):
        generate_file(os.path.join(cwd, default_config), generate_config)
    if args.command in ('generate-manifest', 'generate'):
        generate_file(os.path.join(cwd, default_manifest), generate_manifest)
    # Pipeline execution
    elif args.command == 'run':
        require(os.path.exists(args.config), '{} not found. Please run '
                                             'generate-config'.format(args.config))
        if not hasattr(args, 'sample'):
            require(os.path.exists(args.manifest), '{} not found and no samples provided. Please '
                                                   'run "generate-manifest"'.format(args.manifest))
        # Parse config
        parsed_config = {x.replace('-', '_'): y for x, y in yaml.load(open(args.config).read()).iteritems()}
        inputs = argparse.Namespace(**parsed_config)

        # Parse manifest file
        uuid_list = []
        with open(args.manifest) as f_manifest:
            for line in f_manifest:
                if not line.isspace() and not line.startswith('#'):
                    uuid_list.append(line.strip())

        inputs.sort = False
        if not inputs.dir_suffix:
            inputs.dir_suffix = ''
        if not inputs.s3_bucket:
            inputs.s3_bucket = ''

        if inputs.master_ip and inputs.num_nodes:
            raise ValueError("Exactly one of master_ip (%s) and num_nodes (%d) must be provided." %
                             (inputs.master_ip, inputs.num_nodes))

        # master_ip is always an attribute once the config is parsed (it may just be blank),
        # so test its value rather than its presence.
        if not inputs.master_ip and inputs.num_nodes <= 1:
            raise ValueError('num_nodes allocates one Spark/HDFS master and n-1 workers, and thus must be greater '
                             'than 1. %d was passed.' % inputs.num_nodes)

        if (inputs.pipeline_to_run != "adam" and
            inputs.pipeline_to_run != "gatk" and
            inputs.pipeline_to_run != "both"):
            raise ValueError("pipeline_to_run must be either 'adam', 'gatk', or 'both'. %s was passed." % inputs.pipeline_to_run)

        Job.Runner.startToil(Job.wrapJobFn(sample_loop, uuid_list, inputs), args)
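# Illustrative only: the pipeline is meant to be driven from the command line, so a
# module-level entry point along these lines is assumed; the script name in the sample
# invocations below is a placeholder.
if __name__ == '__main__':
    main()

# Typical invocations (placeholders):
#   python adam_gatk_pipeline.py generate
#   python adam_gatk_pipeline.py run --config adam-gatk.config --manifest adam-gatk-manifest.csv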