Example 1
import configparser
import sys

import boto3

import utility  # local helper module: get_cluster_id, upload_files_to_s3, etc.

# check_configuration, calculate_num_executor and build_command are defined
# elsewhere in this module (not shown in this excerpt).
def submit_classify_job(job_config, cluster_id, dry_run, **kwargs):
    """Submit the classify step to a running EMR cluster, or print the
    assembled job arguments on a dry run."""
    job_configuration = "config/classify_job.config"
    if job_config is not None and job_config.strip() != "":
        job_configuration = job_config.strip()

    config = configparser.ConfigParser()
    config.optionxform = str
    config.read(job_configuration)

    if cluster_id is None or cluster_id.strip() == "":
        cluster_id = utility.get_cluster_id(dry_run)
    else:
        cluster_id = cluster_id.strip()

    if cluster_id != "" and check_configuration(config):
        if config["job_config"].get("upload_classify_script", "False") == "True":
            utility.upload_files_to_s3([(config["job_config"]["classify_script"],
                                         config["job_config"]["classify_script_local_location"],
                                         config["job_config"]["classify_script_s3_location"])], dry_run)

        num_executors = calculate_num_executor(cluster_id, config["spark_config"]["executor_memory"])
        if num_executors < 0:
            config["spark_config"]["num_executors"] = "None"
        else:
            config["spark_config"]["num_executors"] = str(num_executors)

        config["spark_config"]["executor_cores"] = "1"

        job_argument = build_command(cluster_id, config, num_executors)

        if not dry_run:
            emr_client = boto3.client("emr")
            # warn user before removing any output
            out = config["script_arguments"]["output_location"]
            # find out which output dirs, if any, exist
            dirs_to_remove = utility.check_s3_path_exists([out])
            # if any exist, ask the user to confirm before deleting them
            if dirs_to_remove:
                response = input("About to remove any existing output directories." +
                                 "\n\n\t{}\n\nProceed? [y/n]: ".format(
                                     '\n\n\t'.join(dirs_to_remove)))
                while response not in ['y', 'n']:
                    response = input('Proceed? [y/n]: ')
                if response == 'n':
                    print("Program Terminated.  Modify config file to change " +
                          "output directories.")
                    sys.exit(0)
                # remove the output directories
                if not utility.remove_s3_files(dirs_to_remove):
                    print("Program terminated")
                    sys.exit(1)
            job_submission = emr_client.add_job_flow_steps(**job_argument)
            print("Submitted job to cluster {}. Job id is {}".format(cluster_id, job_submission["StepIds"][0]))
        else:
            print(job_argument)
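
The function reads all of its settings from an INI-style file. Below is a minimal sketch of the sections and keys it expects, reconstructed from the lookups above; the section and key names come from the code, but every value is an illustrative placeholder, and check_configuration and build_command likely consume further keys not shown here.

import configparser

sample = """
[job_config]
upload_classify_script = True
classify_script = classify.py
classify_script_local_location = ./scripts
classify_script_s3_location = s3://my-bucket/scripts/

[spark_config]
executor_memory = 4g

[script_arguments]
output_location = s3://my-bucket/output/
"""

config = configparser.ConfigParser()
config.optionxform = str  # preserve key case, as the examples here do
config.read_string(sample)
print(config["spark_config"]["executor_memory"])  # -> 4g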
Example 2
def upload_files_to_s3(cfg, dry_run):
    """
    uploads files to aws s3 storage - and updates the configuration object with
    the details of the s3 files
    :param cfg: ConfigParser configuration object
    :param dry_run: flag to indicate if this is "dry run" or not
    :return: the configuration object
    """
    s3_upload_list = []

    section = "job_config"
    if cfg[section]["upload_script"] == "True":
        s3_upload_list.append((cfg[section]["script"],
                               cfg[section]["script_local_location"],
                               cfg[section]["script_s3_location"]))

    section = "user_script_config"
    if cfg[section]["upload_user_files"] == "True":
        # upload the compulsory user script
        s3_upload_list.append((cfg[section]["script"],
                               cfg[section]["user_files_local_location"],
                               cfg[section]["user_files_s3_location"]))

        # upload any optional user files
        if "supporting_files" in cfg[section]:
            for f in cfg[section]["supporting_files"].split(','):
                if f.strip() != "":
                    s3_upload_list.append((f.strip(), cfg[section]["user_files_local_location"],
                                           cfg[section]["user_files_s3_location"]))

    # call utility code to upload list of files to s3
    files = utility.upload_files_to_s3(s3_upload_list, dry_run)
    cfg["s3"] = {"files": files}

    return cfg
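
Each tuple appended to s3_upload_list pairs a file name with a local directory and an S3 destination, so utility.upload_files_to_s3 evidently expects (file_name, local_location, s3_location) triples. A sketch of a configuration that exercises both branches, including the comma-separated supporting_files list; all values below are placeholders.

import configparser

cfg = configparser.ConfigParser()
cfg.optionxform = str
cfg.read_string("""
[job_config]
upload_script = True
script = pipeline.py
script_local_location = ./scripts
script_s3_location = s3://my-bucket/scripts/

[user_script_config]
upload_user_files = True
script = user_analysis.py
user_files_local_location = ./user
user_files_s3_location = s3://my-bucket/user/
supporting_files = helpers.py, model.pkl
""")

# cfg can now be passed straight to the function above:
# cfg = upload_files_to_s3(cfg, dry_run=True)  # adds the [s3] section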
Example 3
    if parser_result.job_config is not None and parser_result.job_config.strip() != "":
        job_configuration = parser_result.job_config.strip()

    config = configparser.ConfigParser()
    config.optionxform = str
    config.read(job_configuration)

    if parser_result.cluster_id is None or parser_result.cluster_id.strip() == "":
        cluster_id = utility.get_cluster_id(parser_result.dry_run)
    else:
        cluster_id = parser_result.cluster_id.strip()

    if cluster_id != "" and check_configuration(config):
        if config["job_config"].get("upload_downloader_script", "False") == "True":
            utility.upload_files_to_s3([(config["job_config"]["downloader_script"],
                                         config["job_config"]["downloader_script_local_location"],
                                         config["job_config"]["downloader_script_s3_location"])], parser_result.dry_run)

        job_argument = build_command(config)

        if not parser_result.dry_run:
            emr_client = boto3.client("emr")
            # warn user before removing any output
            out = config["script_arguments"]["output_location"]
            rep = config["script_arguments"]["report_location"]
            # find out which output dirs, if any, exist
            dirs_to_remove = utility.check_s3_path_exists([out, rep])
            if dirs_to_remove:
                response = input("About to remove any existing output directories." +
                                 "\n\n\t{}\n\nProceed? [y/n]: ".format(
                                     '\n\n\t'.join(dirs_to_remove)))
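
utility.check_s3_path_exists is not shown in these excerpts; judging by its use here and in Example 1, it takes a list of s3:// paths and returns the subset that already exist. A hedged sketch of one way to implement it with boto3; the real utility module may differ.

import boto3

def check_s3_path_exists(paths):
    # Return the s3:// paths that already contain at least one object.
    s3 = boto3.client("s3")
    existing = []
    for path in paths:
        bucket, _, prefix = path.replace("s3://", "", 1).partition("/")
        response = s3.list_objects_v2(Bucket=bucket, Prefix=prefix, MaxKeys=1)
        if response.get("KeyCount", 0) > 0:
            existing.append(path)
    return existing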
Example 4
    if parser_result.job_config is not None and parser_result.job_config.strip() != "":
        job_configuration = parser_result.job_config.strip()

    config = configparser.ConfigParser()
    config.optionxform = str
    config.read(job_configuration)

    if parser_result.cluster_id is None or parser_result.cluster_id.strip() == "":
        cluster_id = utility.get_cluster_id(parser_result.dry_run)
    else:
        cluster_id = parser_result.cluster_id.strip()

    if cluster_id != "" and check_configuration(config):
        if config["job_config"].get("upload_analysis_script", "False") == "True":
            utility.upload_files_to_s3([(config["job_config"]["analysis_script"],
                                         config["job_config"]["analysis_script_local_location"],
                                         config["job_config"]["analysis_script_s3_location"])], parser_result.dry_run)

        num_executors = calculate_num_executor(cluster_id, config["spark_config"]["executor_memory"])
        if num_executors < 0:
            config["spark_config"]["num_executors"] = "None"
        else:
            config["spark_config"]["num_executors"] = str(num_executors)

        config["spark_config"]["executor_cores"] = "1"

        job_argument = build_command(config)

        if not parser_result.dry_run:
            emr_client = boto3.client("emr")
            # warn user before removing any output
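
calculate_num_executor is also defined outside these excerpts. From the call sites, it takes the cluster id and the configured executor memory, and returns a negative number when a value cannot be computed (the caller then writes the string "None" into the config). A hypothetical sketch of the intended arithmetic; get_total_worker_memory_gb is a made-up helper standing in for an EMR instance-group query.

def calculate_num_executor(cluster_id, executor_memory):
    # get_total_worker_memory_gb is hypothetical; the real code would have to
    # inspect the cluster's instance groups via the EMR API.
    total_gb = get_total_worker_memory_gb(cluster_id)
    if total_gb is None:
        return -1  # signal "unknown"; the caller stores "None"
    per_executor_gb = int(executor_memory.lower().rstrip("g"))
    return total_gb // per_executor_gb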
Example 5
        action="store_true",
        dest="dry_run",
        help="Produce the configurations for the cluster to be created")
    parser_result = parser.parse_args()

    if parser_result.emr_config and parser_result.emr_config.strip() != "":
        emr_configuration = parser_result.emr_config

    config = configparser.ConfigParser()
    config.read(emr_configuration)

    if check_configuration(config):
        if config["EMR"].get("upload_bootstrap_scripts", "False") == "True":
            utility.upload_files_to_s3(
                [(bootstrap_script.strip(),
                  config["EMR"]["bootstrap_scripts_local_location"],
                  config["EMR"]["bootstrap_scripts_s3_location"])
                 for bootstrap_script in
                 config["EMR"]["bootstrap_scripts"].split(",")],
                parser_result.dry_run)

        emr_argument = build_command(config)

        if not parser_result.dry_run:
            emr_client = boto3.client("emr")
            cluster_launch = emr_client.run_job_flow(**emr_argument)
            print("Cluster has been launched with ID",
                  cluster_launch["JobFlowId"])
        else:
            print("\n".join([
                "{} = {}".format(*emr_arg)
                for emr_arg in list(emr_argument.items())
            ]))
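
The bootstrap_scripts value is a single comma-separated string; the comprehension above splits it and builds one (name, local_location, s3_location) triple per script. A sketch of the [EMR] keys involved, with placeholder values:

import configparser

config = configparser.ConfigParser()
config.read_string("""
[EMR]
upload_bootstrap_scripts = True
bootstrap_scripts = install_deps.sh, configure_spark.sh
bootstrap_scripts_local_location = ./bootstrap
bootstrap_scripts_s3_location = s3://my-bucket/bootstrap/
""")

scripts = [s.strip() for s in config["EMR"]["bootstrap_scripts"].split(",")]
print(scripts)  # ['install_deps.sh', 'configure_spark.sh']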
Example 6
import argparse
import configparser

import boto3

import utility  # local helper module: upload_files_to_s3, etc.

# check_configuration and build_command are defined earlier in this module
# (not shown in this excerpt).
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Cluster launcher for spark-based RNA-seq Pipeline')
    parser.add_argument('--config', '-c', action="store", dest="emr_config", help="EMR configuration file")
    parser.add_argument('--dry-run', '-d', action="store_true", dest="dry_run",
                        help="Produce the configurations for the cluster to be created")
    parser_result = parser.parse_args()

    # Default path assumed here, following the pattern of Example 1; without
    # some default, emr_configuration is undefined when --config is omitted.
    emr_configuration = "config/emr_cluster.config"
    if parser_result.emr_config and parser_result.emr_config.strip() != "":
        emr_configuration = parser_result.emr_config.strip()

    config = configparser.ConfigParser()
    config.read(emr_configuration)

    if check_configuration(config):
        if config["EMR"].get("upload_bootstrap_scripts", "False") == "True":
            utility.upload_files_to_s3(
                [(bootstrap_script.strip(), config["EMR"]["bootstrap_scripts_local_location"],
                 config["EMR"]["bootstrap_scripts_s3_location"])
                 for bootstrap_script in config["EMR"]["bootstrap_scripts"].split(",")],
                parser_result.dry_run)

        emr_argument = build_command(config)

        if not parser_result.dry_run:
            emr_client = boto3.client("emr")
            cluster_launch = emr_client.run_job_flow(**emr_argument)
            print("Cluster has been launched with ID", cluster_launch["JobFlowId"])
        else:
            print("\n".join(["{} = {}".format(*emr_arg) for emr_arg in list(emr_argument.items())]))