Example #1
import collections
import json
import os

# Constants (STAR_INDICES, BOWTIE_INDICES, RIBO_SUFFIX, ANNOTATIONS, JOBS_*,
# LIBSTATUS, *_GENERIC_TSV) and helpers (get_tasks, get_control, get_last_dag_id,
# remove_not_set_inputs, the Biow* exception classes) are defined elsewhere in the package.
def submit_job(db_settings, row, raw_data, indices, threads, jobs_folder):
    """Generate and export job file to a specific folder"""
    kwargs = {
        "pair": ('pair' in row[0]),
        "dUTP": ('dUTP' in row[0]),
        "workflow": os.path.splitext(os.path.basename(row[1]))[0],
        "template": row[2],
        "uid": row[3],
        "genome_db": row[4],            # not used
        "genome": row[5],
        "annotation": row[6],           # not used
        "annottable": row[7],           # not used
        "spike": ('spike' in row[8]),
        "forcerun": (int(row[9]) == 1),
        "clip_5p_end": int(row[10]),
        "clip_3p_end": int(row[11]),
        "raw_data": raw_data,
        "indices": indices,
        "threads": threads
    }

    jobs_folder = jobs_folder if os.path.isabs(jobs_folder) else os.path.join(os.getcwd(), jobs_folder)

    # Set the values used to fill in the job template.
    #  We always build both upstream and downstream paths, even if only the upstream one is used
    kwargs["fastq_file_upstream"] = os.path.join(kwargs["raw_data"], kwargs["uid"], kwargs["uid"] + '.fastq')
    kwargs["fastq_file_downstream"] = os.path.join(kwargs["raw_data"], kwargs["uid"], kwargs["uid"] + '_2.fastq')
    kwargs["star_indices_folder"] = os.path.join(kwargs["indices"], STAR_INDICES, kwargs["genome"])
    # RIBO_SUFFIX is appended to "genome" to get the folder name for the ribosomal Bowtie indices
    kwargs["bowtie_indices_folder"] = os.path.join(kwargs["indices"], BOWTIE_INDICES, kwargs["genome"]+RIBO_SUFFIX)
    kwargs["chrom_length"] = os.path.join(kwargs["indices"], BOWTIE_INDICES, kwargs["genome"], CHR_LENGTH_GENERIC_TSV)
    kwargs["annotation_input_file"] = os.path.join(kwargs["indices"], ANNOTATIONS, kwargs["genome"],ANNOTATION_GENERIC_TSV)
    kwargs["exclude_chr"] = "control" if kwargs["spike"] else ""
    kwargs["output_folder"] = os.path.join(kwargs["raw_data"], kwargs["uid"])

    job_file_basename = kwargs["workflow"] + '-' + kwargs["uid"] + '.json'
    output_filename = os.path.join(jobs_folder, JOBS_NEW, job_file_basename)
    running_filename = os.path.join(jobs_folder, JOBS_RUNNING, job_file_basename)
    
    if not os.path.isfile(kwargs["fastq_file_upstream"]) or (kwargs['pair'] and not os.path.isfile(kwargs["fastq_file_downstream"])):
        raise BiowFileNotFoundException(kwargs["uid"])

    filled_template = kwargs['template'].replace('\n', ' ').format(**kwargs)
    # format() renders Python booleans as True/False, which is not valid JSON
    filled_template = filled_template.replace("'True'", 'true').replace("'False'", 'false')
    filled_template = filled_template.replace('"True"', 'true').replace('"False"', 'false')
    filled_job_object = remove_not_set_inputs(json.loads(filled_template))
    filled_job_str = json.dumps(collections.OrderedDict(sorted(filled_job_object.items())), indent=4)

    # Check whether the job file already exists in the new or running folder
    if os.path.isfile(output_filename) or os.path.isfile(running_filename):
        raise BiowJobException(kwargs['uid'], message="Duplicate job file [{}]: it has already been created".format(job_file_basename))

    try:
        with open(output_filename, 'w') as output_file:
            output_file.write(filled_job_str)
    except Exception as ex:
        raise BiowJobException(kwargs['uid'], message="Failed to write job file: "+str(ex))
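
The template mechanics buried in the long expression above deserve a closer look: the template (row[2]) is a JSON document with {placeholder} fields, str.format(**kwargs) fills them in, and because format() renders Python booleans as True/False (which JSON rejects), the string replacements lower them before json.loads. A self-contained sketch of just that step, with an invented two-field template:

import json

# Invented minimal template for illustration; the real ones come from the database (row[2]).
template = '{{"uid": "{uid}", "pair": "{pair}", "threads": {threads}}}'
kwargs = {"uid": "EXP000001", "pair": True, "threads": 4}

filled = template.replace('\n', ' ').format(**kwargs)
# filled == '{"uid": "EXP000001", "pair": "True", "threads": 4}' -- "True" is not a JSON boolean
filled = filled.replace('"True"', 'true').replace('"False"', 'false')

assert json.loads(filled) == {"uid": "EXP000001", "pair": True, "threads": 4}
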
def check_job(db_settings, uid, workflow, jobs_folder):
    """Check the status of the current job in the Airflow DB"""
    tasks, total = get_tasks(uid, db_settings)
    tasks = {k: v for k, v in tasks.items() if v}  # drop task states with no entries
    if not tasks:
        failed_file = os.path.join(
            jobs_folder, JOBS_FAIL,
            os.path.splitext(os.path.basename(workflow))[0] + '-' + uid + '.json')
        # The job file may have been moved to the failed folder before the job even started
        if os.path.isfile(failed_file):
            raise BiowJobException(uid, message="Job file is already marked as failed")
        return None, None
    if tasks.get("failed"):
        raise BiowWorkflowException(uid, message=tasks)
    elif total > 0 and len(tasks.get(
            "success", [])) == total:  # All the tasks exit with success
        return LIBSTATUS["SUCCESS_PROCESS"], "Complete"
    else:
        percent_complete = 0
        try:
            percent_complete = int(
                float(len(tasks.get("success", []))) / total * 100)
        except ZeroDivisionError:
            pass
        return LIBSTATUS["PROCESSING"], "Processing: " + str(
            percent_complete) + "%"
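
check_job is written to be polled: it returns (None, None) while no task has reported yet, raises BiowWorkflowException on any failed task, and otherwise maps the success count to a percentage. A hedged polling loop built on that contract (the poll interval and print are illustrative, not from the source):

import time

def wait_for_job(db_settings, uid, workflow, jobs_folder, poll_seconds=60):
    """Illustrative driver; BiowWorkflowException propagates to the caller on failure."""
    while True:
        status, message = check_job(db_settings, uid, workflow, jobs_folder)
        if status is not None:
            print("{}: {}".format(uid, message))
            if status == LIBSTATUS["SUCCESS_PROCESS"]:
                return status
        time.sleep(poll_seconds)
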
def raise_if_dag_exists(uid, db_settings):
    """Raise if DAG run with the dag_id already exists"""
    try:
        raise_if_dag_absent(uid, db_settings)
    except BiowBasicException:
        pass
    else:
        raise BiowJobException(uid, message='Duplicate dag_id. Use ForceRun')
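
Note the inversion here: if raise_if_dag_absent raises, the DAG does not exist and the "exists" check passes silently; if it returns cleanly, the DAG exists and that becomes the error. The same try/except/else shape is repeated for files and tables below; a generic sketch of the pattern (the helper name is invented):

def invert_check(absence_check, exists_error):
    """Turn a 'raise if absent' check into a 'raise if present' check (illustrative only)."""
    try:
        absence_check()           # raises BiowBasicException when the resource is absent
    except BiowBasicException:
        pass                      # absent, so the 'exists' check passes
    else:
        raise exists_error        # the absence check passed, so the resource exists

# Equivalent to raise_if_dag_exists:
# invert_check(lambda: raise_if_dag_absent(uid, db_settings),
#              BiowJobException(uid, message='Duplicate dag_id. Use ForceRun'))
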
def raise_if_file_exists(uid, filename):
    """Raise if the file already exists"""
    try:
        raise_if_file_absent(uid, filename)
    except BiowBasicException:
        pass
    else:
        raise BiowJobException(
            uid, message="File already exists {0}".format(filename))
def raise_if_table_exists(db_settings, uid, table, db):
    """Raise if the table already exists in the database"""
    try:
        raise_if_table_absent(db_settings, uid, table, db)
    except BiowBasicException:
        pass
    else:
        raise BiowJobException(uid,
                               message="Table {0}.{1} already exists".format(
                                   db, table))
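
How these three guards are wired together is not shown in this excerpt; the "Use ForceRun" hint above suggests the caller skips them when the forcerun flag from the row is set. A hypothetical pre-flight sequence under that assumption:

def preflight(db_settings, uid, output_filename, forcerun):
    # Hypothetical wiring; the forcerun gating and the table/db names are
    # assumptions, not taken from this excerpt.
    if not forcerun:
        raise_if_dag_exists(uid, db_settings)
        raise_if_file_exists(uid, output_filename)
        raise_if_table_exists(db_settings, uid, uid + "_genes", "experiments")
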
def submit_job(db_settings, row, raw_data, indices, threads, jobs_folder):
    """Generate and export job file to a specific folder"""

    kwargs = {
        "pair": ('pair' in row[0]),
        "workflow": os.path.splitext(os.path.basename(row[1]))[0],
        "template": row[2],
        "genome_db": row[3],
        "genome": row[4],
        "uid": row[6],
        "exp_fragment_size": int(row[7]),
        "force_fragment_size": (int(row[8]) == 1),
        "forcerun": (int(row[9]) == 1),
        "clip_5p_end": int(row[10]),
        "clip_3p_end": int(row[11]),
        "broad_peak": (int(row[12]) == 2),
        "remove_duplicates": (int(row[13]) == 1),
        "genome_size": row[14],
        "control_id": row[15],
        "raw_data": raw_data,
        "indices": indices,
        "threads": threads
    }

    jobs_folder = jobs_folder if os.path.isabs(jobs_folder) else os.path.join(
        os.getcwd(), jobs_folder)

    #  We always build both upstream and downstream paths, even if only the upstream one is used
    kwargs["fastq_file_upstream"] = os.path.join(kwargs["raw_data"],
                                                 kwargs["uid"],
                                                 kwargs["uid"] + '.fastq')
    kwargs["fastq_file_downstream"] = os.path.join(kwargs["raw_data"],
                                                   kwargs["uid"],
                                                   kwargs["uid"] + '_2.fastq')
    kwargs["bowtie_indices_folder"] = os.path.join(kwargs["indices"],
                                                   BOWTIE_INDICES,
                                                   kwargs["genome"])
    kwargs["chrom_length"] = os.path.join(kwargs["indices"], BOWTIE_INDICES,
                                          kwargs["genome"],
                                          CHR_LENGTH_GENERIC_TSV)
    kwargs["annotation_input_file"] = os.path.join(kwargs["indices"],
                                                   ANNOTATIONS,
                                                   kwargs["genome"],
                                                   ANNOTATION_GENERIC_TSV)
    kwargs["output_folder"] = os.path.join(kwargs["raw_data"], kwargs["uid"])

    job_file_basename = kwargs["workflow"] + '-' + kwargs["uid"] + '.json'
    output_filename = os.path.join(jobs_folder, JOBS_NEW, job_file_basename)
    running_filename = os.path.join(jobs_folder, JOBS_RUNNING, job_file_basename)

    # get_control raises BiowFileNotFoundException when the control sample is unavailable
    kwargs["control_file"] = get_control(db_settings, **kwargs)

    if not os.path.isfile(kwargs["fastq_file_upstream"]) or (kwargs['pair'] and not os.path.isfile(kwargs["fastq_file_downstream"])):
        raise BiowFileNotFoundException(kwargs["uid"])

    filled_template = kwargs['template'].replace('\n', ' ').format(**kwargs)
    # format() renders Python booleans as True/False, which is not valid JSON
    filled_template = filled_template.replace("'True'", 'true').replace("'False'", 'false')
    filled_template = filled_template.replace('"True"', 'true').replace('"False"', 'false')
    filled_job_object = remove_not_set_inputs(json.loads(filled_template))
    filled_job_str = json.dumps(collections.OrderedDict(sorted(filled_job_object.items())), indent=4)

    # Check whether the job file already exists in the new or running folder
    if os.path.isfile(output_filename) or os.path.isfile(running_filename):
        raise BiowJobException(
            kwargs['uid'],
            message="Duplicate job file. It has already been created")

    try:
        with open(output_filename, 'w') as output_file:
            output_file.write(filled_job_str)
    except Exception as ex:
        raise BiowJobException(kwargs['uid'],
                               message="Failed to write job file: " + str(ex))
def raise_if_dag_absent(uid, db_settings):
    """Raise if no DAG run with this dag_id is found"""
    if not get_last_dag_id(uid, db_settings):
        raise BiowJobException(uid, message='DAG is not found')