def upload_data(master_ip, inputs, hdfs_name, upload_name, spark_on_toil):
    """
    Upload file hdfsName from hdfs to s3
    """

    if mock_mode():
        truncate_file(master_ip, hdfs_name, spark_on_toil)

    log.info("Uploading output BAM %s to %s.", hdfs_name, upload_name)
    call_conductor(master_ip, hdfs_name, upload_name, memory=inputs['memory'])
def generate_config():
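    """
    Return the YAML configuration template for the ADAM/GATK pipeline as a string
    (the mock template when running in mock mode).
    """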
    if mock_mode():
        return generate_mock_config()

    return textwrap.dedent("""
        # ADAM/GATK Pipeline configuration file
        # This configuration file is formatted in YAML. Simply write the value (at least one space) after the colon.
        # Edit the values in this configuration file and then rerun the pipeline
        # Comments (beginning with #) do not need to be removed. Optional parameters may be left blank.
        ##############################################################################################################
        pipeline-to-run: both     # Which pipeline to run: 'adam', 'gatk', or 'both'
        skip-alignment: False     # Skip the alignment step
        skip-preprocessing: False # Skip the preprocessing step
        sequence-dir: sequence    # Directory where raw sequences are
        autoscale-cluster: False  # Autoscale the Spark cluster
        s3-bucket:                # S3 Bucket URI
        cpu-count:                # Optional: Number of cores to use per node
        program-unit: 12345       # Program unit for the BAM read group
        platform: ILLUMINA        # Sequencing platform for the BAM read group
        ref:                      # Required: Reference fasta file
        amb:                      # Required: Reference fasta file (amb)
        ann:                      # Required: Reference fasta file (ann)
        bwt:                      # Required: Reference fasta file (bwt)
        pac:                      # Required: Reference fasta file (pac)
        sa:                       # Required: Reference fasta file (sa)
        fai:                      # Required: Reference fasta file (fai)
        alt:                      # Optional: Alternate file for reference build (alt). Necessary for alt aware alignment.
        phase:                    # Required: URL (1000G_phase1.indels.hg19.sites.fixed.vcf)
        mills:                    # Required: URL (Mills_and_1000G_gold_standard.indels.hg19.sites.vcf)
        dbsnp:                    # Required: URL (dbsnp_132_b37.leftAligned.vcf)
        hapmap:                   # Required: URL (hapmap_3.3.b37.vcf)
        omni:                     # Required: URL (1000G_omni.5.b37.vcf)
        trim-adapters: False      # Trim adapters.
        num-nodes: 9              # Number of nodes to use. Do not set if providing master_ip.
        master-ip:                # Optional: IP or hostname of host running for Spark master and HDFS namenode.
                                  # Should be provided instead of num-nodes if pointing at a static (external or
                                  # standalone) Spark cluster. The special value 'auto' indicates the master of
                                  # an externally autoscaled cgcloud spark cluster, i.e. one that is managed by
                                  # the uberscript.
        file-size: 100G           # Approximate input file size. Should be given as %d[TGMK], e.g.,
                                  # for a 100 gigabyte file, use file-size: '100G'
        ssec:                     # Optional: (string) Path to Key File for SSE-C Encryption
        dir-suffix:               # Optional: suffix to add to output directory names.
        memory:                   # Required: Amount of available memory on each worker node.
    """[1:])
def generate_manifest():
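    """
    Return the manifest template as a string: one sample UUID per line
    (the mock manifest when running in mock mode).
    """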
    if mock_mode():
        return generate_mock_manifest()
    return textwrap.dedent("""
        #   Edit this manifest to include information pertaining to each sample to be run.
        #   There is a single column: UUID
        #
        #   UUID        This should be a unique identifier for the sample to be processed that corresponds to 
        #               the prefix of the filenames of the input fastq files.
        #   
        #   Example:
        #   If your input fastq file pairs were input_file_name_1.illumina_1.fastq.gz, input_file_name_1.illumina_2.fastq.gz and 
        #   input_file_name_2.illumina_1.fastq.gz, input_file_name_2.illumina_2.fastq.gz, the manifest would be:
        #
        #   input_file_name_1.illumina
        #   input_file_name_2.illumina   
        #
        #   Input fastq files MUST be named according to the filename_1.fastq.gz, filename_2.fastq.gz convention
        #
        #   Place your samples below, one per line.
        """[1:])
def launch_pipeline(params):
    """
    Launches pipeline in a screen session on toil-leader. 

    :param argparse.Namespace params: parsed command line arguments and options 
    """
    if not params.jobstore:
        jobstore = '{}-{}'.format(uuid4(), str(datetime.utcnow().date()))
    else:
        jobstore = params.jobstore
    restart = '--restart' if params.restart else ''
    log.info('Launching Pipeline and blocking. Check log.txt on leader for stderr and stdout')
    try:
        # Create screen session
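        # 'screen -dmS' starts a detached session named after the cluster on toil-leader;
        # the assembled pipeline command is injected into it below via 'screen -X stuff'.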
        check_call(['cgcloud',
                    'ssh',
                    '--zone', params.zone,
                    '--cluster-name', params.cluster_name,
                    'toil-leader',
                    '-o', 'StrictHostKeyChecking=no',
                    'screen', '-dmS', params.cluster_name])

        if params.reference_genome == 'GRCh38':
            from toil_scripts.adam_uberscript.input_files import GRCh38_inputs as inputs
        elif params.reference_genome == 'hg19':
            from toil_scripts.adam_uberscript.input_files import hg19_inputs as inputs
        else:
            raise ValueError('Invalid reference genome %s' % params.reference_genome)

        # Assemble pipeline command to be stuffed into a screen session
        
        pipeline_command = ['PYTHONPATH=$PYTHONPATH:~/toil-scripts/src',
                            'python -m toil_scripts.adam_gatk_pipeline.align_and_call',
                            'aws:{region}:{j}',
                            '--autoscale_cluster',
                            '--retryCount 1',
                            '--use_bwakit',
                            '--driver_memory {m}',
                            '--executor_memory {m}',
                            '--batchSystem mesos',
                            '--mesosMaster $(hostname -i):5050',
                            '--workDir /var/lib/toil',
                            '--logInfo']
        if mock_mode():
            pipeline_command = ['ADAM_GATK_MOCK_MODE=1'] + \
                               pipeline_command + \
                               ['--dir_suffix /mock']
        else:
            pipeline_command += ['--s3_bucket {b}',
                                 '--bucket_region {region}',
                                 '--sequence_dir {sequence_dir}',
                                 '--dir_suffix /{genome}',
                                 '--uuid_manifest ~/manifest',
                                 '--ref {ref}',
                                 '--amb {amb}',
                                 '--ann {ann}',
                                 '--bwt {bwt}',
                                 '--pac {pac}',
                                 '--sa {sa}',
                                 '--fai {fai}',
                                 '--phase {phase}',
                                 '--mills {mills}',
                                 '--dbsnp {dbsnp}',
                                 '--omni {omni}',
                                 '--hapmap {hapmap}',
                                 '--file_size {fs}']

            if 'alt' in inputs:
                pipeline_command.append('--alt {alt}')

        # Do we have a defined master IP?
        if params.master_ip:
            pipeline_command.append('--master_ip %s' % params.master_ip)
        elif params.spark_nodes:
            pipeline_command.append('--num_nodes %s' % params.spark_nodes)

        pipeline_command.append('{r} 2>&1 | tee toil_output\n')

        pipeline_command = ' '.join(pipeline_command)
        pipeline_command = pipeline_command.format(j=jobstore,
                                                   b=params.bucket,
                                                   region=region_of_zone(params.zone),
                                                   m=params.memory,
                                                   fs=params.file_size,
                                                   r=restart,
                                                   sequence_dir=params.sequence_dir,
                                                   genome=params.reference_genome,
                                                   **inputs)

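        # 'screen -X stuff' only accepts a limited amount of text per invocation, so push
        # the assembled command to the session in 500-character chunks.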
        chunk_size = 500
        for chunk in [pipeline_command[i:i + chunk_size] for i in range(0, len(pipeline_command), chunk_size)]:
            check_call(['cgcloud',
                        'ssh',
                        '--zone', params.zone,
                        '--cluster-name', params.cluster_name,
                        'toil-leader',
                        '-o', 'StrictHostKeyChecking=no',
                        'screen', '-S', params.cluster_name,
                        '-X', 'stuff', quote(chunk)])

    except CalledProcessError as e:
        log.error('Pipeline exited with a non-zero status code: %s', e)
def main():
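    """
    Parses command-line arguments for the ADAM/GATK uberscript and dispatches to the
    launch-cluster, launch-pipeline or launch-metrics subcommand.
    """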
    parser = argparse.ArgumentParser(description=main.__doc__,
                                     formatter_class=argparse.RawDescriptionHelpFormatter)
    subparsers = parser.add_subparsers(dest='command')
    
    # Launch cluster
    cluster_sp = subparsers.add_parser('launch-cluster',
                                       help='Launches EC2 cluster via CGCloud')
    cluster_sp.add_argument('-S', '--share', required=True,
                            help='Full path to directory containing pipeline script, launch script, and master key.')
    cluster_sp.add_argument('-T', '--leader-type', default='m3.medium',
                            help='Sets leader instance type.')
    cluster_sp.add_argument('-b', '--boto-path', default='/home/mesosbox/.boto', type=str,
                            help='Path to local .boto file to be placed on leader.')
    cluster_sp.add_argument('-M', '--manifest-path', required=not mock_mode(),
                            default="/home/ubuntu/toil-scripts/src/toil_scripts/adam_gatk_pipeline/mock_manifest" if mock_mode() else None,
                            help='Path to manifest file.')

    # Launch pipeline
    pipeline_sp = subparsers.add_parser('launch-pipeline',
                                        help='Launches pipeline')
    pipeline_sp.add_argument('-j', '--jobstore', default=None,
                             help='Name of jobstore. Defaults to UUID-Date if not set')
    pipeline_sp.add_argument('--restart', default=None, action='store_true',
                             help='Attempts to restart pipeline, requires existing job store.')
    pipeline_sp.add_argument('--master_ip', default=None,
                             help="The address of an external Spark master or 'auto' when using a standalone Spark "
                                  "cluster managed by this script. In the latter case you must pass the "
                                  "--spark-sample-slots option to the launch-metrics command.")
    pipeline_sp.add_argument('-B', '--bucket',
                             help='The name of the destination bucket.')
    pipeline_sp.add_argument('-m', '--memory', default='200g' if not mock_mode() else '3g',
                             help='The amount of memory per worker node, e.g. 200g. Must match what EC2 provides on '
                                  'the specified worker instance type. Defaults to 200g in production mode and 3g in '
                                  'mock mode.')
    pipeline_sp.add_argument('-f', '--file_size', default='100G' if not mock_mode() else '10M',
                             help='Approximate size of the BAM files. Defaults to 100G in production mode and 10M in '
                                  'mock mode.')
    pipeline_sp.add_argument('-s', '--spark_nodes', type=int, default=(8 if not mock_mode() else 2) + 1,
                             help="The number of Spark nodes, including the master, to allocate per sample. When "
                                  "running against a standalone Spark cluster (see --master_ip), the master is "
                                  "shared by all samples and the actual number of workers allocated per sample will "
                                  "be one less than specified here. Otherwise, each sample's subcluster will get its "
                                  "own master node. Defaults to 9 in production mode, 3 in mock mode.")
    pipeline_sp.add_argument('-SD', '--sequence_dir', default='sequence',
                             help='Directory where raw sequences are.')
    pipeline_sp.add_argument('-R', '--reference_genome', default='GRCh38',
                             choices=['GRCh38', 'hg19'],
                             help='Reference genome to align and call against. Choose between GRCh38 and hg19.')

    # Launch metric collection
    metric_sp = subparsers.add_parser('launch-metrics',
                                      help='Launches metric collection thread')
    metric_sp.add_argument('-j', '--jobstore', required=True,  # differs subtly from launch-pipeline's --jobstore
                           help='Name of jobstore')
    metric_sp.add_argument('--namespace', default=os.environ.get('CGCLOUD_NAMESPACE', '/'),
                           help='CGCloud NameSpace')
    metric_sp.add_argument('--spark-sample-slots', required=False, default=0, type=int,
                           help='The maximum number of samples to be computed concurrently on a standalone Spark '
                                'cluster managed by this script. To be used in conjunction with the --master_ip=auto '
                                'option of the launch-pipeline command. The default of 0 disables the standalone '
                                'Spark cluster.')

    # Common options
    cgcloud_zone = os.environ.get('CGCLOUD_ZONE')
    for sp in cluster_sp, pipeline_sp, metric_sp:
        sp.add_argument('-c', '--cluster-name', required=True,
                        help='The CGCloud cluster name for Toil leader and workers.')
        sp.add_argument('-z', '--zone', required=cgcloud_zone is None, default=cgcloud_zone,
                        help="The EC2 availability zone in which to place on-demand instances like the leaders of the "
                             "Toil and standalone Spark clusters. Also determines the region of the S3 bucket and SDB "
                             "domain for Toil's job store. The availability zone for spot instances may be chosen "
                             "independently from all zones in the region containing the specified zone.")
    for sp in cluster_sp, metric_sp:
        sp.add_argument('-t', '--instance-type', default='r3.8xlarge' if not mock_mode() else 'c3.large',
                        help='Worker instance type, e.g. m4.large or c3.8xlarge. Defaults to r3.8xlarge in production '
                             'mode. Will always use c3.large in mock mode, regardless of input value.')
        sp.add_argument('--spot-price', default=None, required=False,
                        help='Instance spot price if desired.')
    for sp in metric_sp, cluster_sp:
        sp.add_argument('-etc', '--add-to-etc-hosts', default=None, required=False,
                        help='Deprecated. Optional entry to add to /etc/hosts on Toil workers. This should *not* be '
                             'used to communicate the address of a standalone Spark master to driver jobs running on '
                             'Toil nodes. Use --master_ip=auto instead.')

    params = parser.parse_args()
    
    if params.command == 'launch-pipeline' and mock_mode() and params.master_ip:
        params.spark_sample_slots = 1

    if params.command == 'launch-cluster':
        launch_cluster(params)
        place_boto_on_leader(params)
    elif params.command == 'launch-pipeline':
        launch_pipeline(params)
    elif params.command == 'launch-metrics':
        manage_metrics_and_cluster_scaling(params)
def main():
    """
    This is a Toil pipeline used to perform alignment of fastqs.
    """
    # Define Parser object and add to Toil
    if mock_mode():
        usage_msg = 'You have the TOIL_SCRIPTS_MOCK_MODE environment variable set, so this pipeline ' \
                    'will run in mock mode. To disable mock mode, set TOIL_SCRIPTS_MOCK_MODE=0'
    else:
        usage_msg = None

    parser = argparse.ArgumentParser(usage=usage_msg)
    subparsers = parser.add_subparsers(dest='command')
    subparsers.add_parser('generate-config', help='Generates an editable config in the current working directory.')
    subparsers.add_parser('generate-manifest', help='Generates an editable manifest in the current working directory.')
    subparsers.add_parser('generate', help='Generates a config and manifest in the current working directory.')
    # Run subparser
    parser_run = subparsers.add_parser('run', help='Runs the ADAM/GATK pipeline')
    default_config = 'adam-gatk-mock.config' if mock_mode() else 'adam-gatk.config'
    default_manifest = 'adam-gatk-mock-manifest.csv' if mock_mode() else 'adam-gatk-manifest.csv'
    parser_run.add_argument('--config', default=default_config, type=str,
                            help='Path to the (filled in) config file, generated with "generate-config".')
    parser_run.add_argument('--manifest', default=default_manifest,
                            type=str, help='Path to the (filled in) manifest file, generated with "generate-manifest". '
                                           '\nDefault value: "%(default)s".')
    Job.Runner.addToilOptions(parser_run)
    args = parser.parse_args()

    cwd = os.getcwd()
    if args.command == 'generate-config' or args.command == 'generate':
        generate_file(os.path.join(cwd, default_config), generate_config)
    if args.command == 'generate-manifest' or args.command == 'generate':
        generate_file(os.path.join(cwd, default_manifest), generate_manifest)
    # Pipeline execution
    elif args.command == 'run':
        require(os.path.exists(args.config), '{} not found. Please run '
                                             'generate-config'.format(args.config))
        if not hasattr(args, 'sample'):
            require(os.path.exists(args.manifest), '{} not found and no samples provided. Please '
                                                   'run "generate-manifest"'.format(args.manifest))
        # Parse config
        parsed_config = {x.replace('-', '_'): y for x, y in yaml.load(open(args.config).read()).iteritems()}
        inputs = argparse.Namespace(**parsed_config)

        # Parse manifest file
        uuid_list = []
        with open(args.manifest) as f_manifest:
            for line in f_manifest:
                if not line.isspace() and not line.startswith('#'):
                    uuid_list.append(line.strip())

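        # Normalize config values: sorting is not exposed in the config, and optional
        # entries left blank parse to None, so fall back to empty strings.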
        inputs.sort = False
        if not inputs.dir_suffix:
            inputs.dir_suffix = ''
        if not inputs.s3_bucket:
            inputs.s3_bucket = ''

        if inputs.master_ip and inputs.num_nodes:
            raise ValueError("Exactly one of master_ip (%s) and num_nodes (%d) must be provided." %
                             (inputs.master_ip, inputs.num_nodes))

        if not inputs.master_ip and inputs.num_nodes <= 1:
            raise ValueError('num_nodes allocates one Spark/HDFS master and n-1 workers, and thus must be greater '
                             'than 1. %d was passed.' % inputs.num_nodes)

        if inputs.pipeline_to_run not in ('adam', 'gatk', 'both'):
            raise ValueError("pipeline_to_run must be either 'adam', 'gatk', or 'both'. %s was passed." %
                             inputs.pipeline_to_run)

        Job.Runner.startToil(Job.wrapJobFn(sample_loop, uuid_list, inputs), args)