Example #1
0
def compute_all_genes_psi(gff_dir, bam_filename, read_len, output_dir,
                          use_cluster=False, chunk_jobs=200,
                          overhang_len=1, paired_end=None,
                          settings=None):
    """
    Compute Psi values for genes using a GFF and a BAM filename.
    """
    gene_ids_to_gff_index = gff_utils.get_gene_ids_to_gff_index(gff_dir)

    num_genes = len(gene_ids_to_gff_index.keys())

    miso_run = os.path.join(miso_path, "run_miso.py")

    print "Computing gene-level Psi for %d genes..." \
          %(num_genes)
    print "  - GFF index: %s" %(gff_dir)
    print "  - BAM: %s" %(bam_filename)
    print "  - Read length: %d" %(read_len)
    print "  - Output directory: %s" %(output_dir)

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    
    # All commands to run
    all_miso_cmds = []

    for gene_id, gff_index_filename in gene_ids_to_gff_index.iteritems():
        miso_cmd = "python %s --compute-gene-psi \"%s\" \"%s\" %s %s --read-len %d " \
                   %(miso_run, gene_id, gff_index_filename, bam_filename, output_dir,
                     read_len)
        
        if paired_end != None:
            # Run in paired-end mode
            frag_mean = float(paired_end[0])
            frag_sd = float(paired_end[1])
            miso_cmd += " --paired-end %.1f %.1f" %(frag_mean, frag_sd)
        else:
            miso_cmd += " --overhang-len %d" %(overhang_len)

        # Add settings filename if given
        if settings != None:
            miso_cmd += " --settings-filename %s" %(settings)

        if use_cluster:
            # If asked to use cluster, accumulate the MISO commands
            # but do not run them
            all_miso_cmds.append(miso_cmd)
        else:
            print "  - Executing: %s" %(miso_cmd)
            os.system(miso_cmd)

    miso_settings = Settings.load(settings)

    if use_cluster:
        # Threshold for putting jobs in the long queue
        long_thresh = 50

        # Delay between jobs
        delay_constant = 0.9
        
        # Invoke the commands using the cluster
        print "Sending %d genes to be run on cluster in chunks of %d..." \
              %(num_genes, chunk_jobs)

        if not chunk_jobs:
            print "  - Using default chunk jobs = %d" %(200)
            chunk_jobs = 200

	chunk_jobs = max(1, int(round(num_genes / float(chunk_jobs))))

        # Split the gene records into batches
	cmd_batches = cluster_utils.chunk_list(all_miso_cmds, chunk_jobs)

        time_str = time.strftime("%m-%d-%y_%H:%M:%S")

        for batch_num, batch in enumerate(cmd_batches):
            batch_size = len(batch)
            print "Running batch %d (batch size = %d)" %(batch_num,
                                                         batch_size)

            if batch_size >= long_thresh:
                queue_type = "long"
            else:
                queue_type = "short"
            
            # Pool all the MISO commands belonging to this batch
            batch_logs_dir = os.path.join(output_dir, "batch-logs")
            if not os.path.isdir(batch_logs_dir):
                os.makedirs(batch_logs_dir)
            batch_logfile = os.path.join(batch_logs_dir,
                                         "batch-%d-%s.log" %(batch_num,
                                                             time_str))
            redirected_output = " >> %s;\n" %(batch_logfile)
            cmd_to_run = redirected_output.join(batch)

            # Run on cluster
            job_name = "gene_psi_batch_%d" %(batch_num)
            cluster_utils.run_on_cluster(cmd_to_run, job_name, output_dir,
                                         queue_type=queue_type,
                                         settings=settings)
            time.sleep(delay_constant)
Example #2
0
def compute_all_genes_psi(gff_dir,
                          bam_filename,
                          read_len,
                          output_dir,
                          use_cluster=False,
                          chunk_jobs=200,
                          overhang_len=1,
                          paired_end=None,
                          settings=None):
    """
    Compute Psi values for genes using a GFF and a BAM filename.
    """
    gene_ids_to_gff_index = gff_utils.get_gene_ids_to_gff_index(gff_dir)

    num_genes = len(gene_ids_to_gff_index.keys())

    miso_run = os.path.join(miso_path, "run_miso.py")

    print "Computing gene-level Psi for %d genes..." \
          %(num_genes)
    print "  - GFF index: %s" % (gff_dir)
    print "  - BAM: %s" % (bam_filename)
    print "  - Read length: %d" % (read_len)
    print "  - Output directory: %s" % (output_dir)

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # All commands to run
    all_miso_cmds = []

    for gene_id, gff_index_filename in gene_ids_to_gff_index.iteritems():
        miso_cmd = "python %s --compute-gene-psi \"%s\" \"%s\" %s %s --read-len %d " \
                   %(miso_run, gene_id, gff_index_filename, bam_filename, output_dir,
                     read_len)

        if paired_end != None:
            # Run in paired-end mode
            frag_mean = float(paired_end[0])
            frag_sd = float(paired_end[1])
            miso_cmd += " --paired-end %.1f %.1f" % (frag_mean, frag_sd)
        else:
            miso_cmd += " --overhang-len %d" % (overhang_len)

        # Add settings filename if given
        if settings != None:
            miso_cmd += " --settings-filename %s" % (settings)

        if use_cluster:
            # If asked to use cluster, accumulate the MISO commands
            # but do not run them
            all_miso_cmds.append(miso_cmd)
        else:
            print "  - Executing: %s" % (miso_cmd)
            os.system(miso_cmd)

    miso_settings = Settings.load(settings)

    if use_cluster:
        # Threshold for putting jobs in the long queue
        long_thresh = 50

        # Delay between jobs
        delay_constant = 0.9

        # Invoke the commands using the cluster
        print "Sending %d genes to be run on cluster in chunks of %d..." \
              %(num_genes, chunk_jobs)

        if not chunk_jobs:
            print "  - Using default chunk jobs = %d" % (200)
            chunk_jobs = 200

        chunk_jobs = max(1, int(round(num_genes / float(chunk_jobs))))

        # Split the gene records into batches
        cmd_batches = cluster_utils.chunk_list(all_miso_cmds, chunk_jobs)

        time_str = time.strftime("%m-%d-%y_%H:%M:%S")

        for batch_num, batch in enumerate(cmd_batches):
            batch_size = len(batch)
            print "Running batch %d (batch size = %d)" % (batch_num,
                                                          batch_size)

            if batch_size >= long_thresh:
                queue_type = "long"
            else:
                queue_type = "short"

            # Pool all the MISO commands belonging to this batch
            batch_logs_dir = os.path.join(output_dir, "batch-logs")
            if not os.path.isdir(batch_logs_dir):
                os.makedirs(batch_logs_dir)
            batch_logfile = os.path.join(
                batch_logs_dir, "batch-%d-%s.log" % (batch_num, time_str))
            redirected_output = " >> %s;\n" % (batch_logfile)
            cmd_to_run = redirected_output.join(batch)

            # Run on cluster
            job_name = "gene_psi_batch_%d" % (batch_num)
            cluster_utils.run_on_cluster(cmd_to_run,
                                         job_name,
                                         output_dir,
                                         queue_type=queue_type,
                                         settings=settings)
            time.sleep(delay_constant)
Example #3
0
def run_two_iso_on_cluster(miso_path, events_filename, event_type, psi_outdir,
                           read_len, overhang_len, chunk_jobs=False):
    """
    Run two-isoform MISO on cluster.

    - chunk_jobs: the number of jobs in each batch.  All jobs in a batch will be assigned to the same processor on
      the cluster.  When chunk_jobs is not specified, each event gets sent as a separate job.
    """
    print "Running two isoform MISO on cluster..."
    # Load two-isoform events
    miso_events = as_events.MISOEvents(2, event_type, from_file=events_filename)
    num_total_events = len(miso_events.events)
    delay_constant = 0.9

    if not chunk_jobs:
	event_batches = [miso_events.events]
    else:
	# Make sure we're chunking into more than one batch of jobs
	assert(chunk_jobs > 1)

        # Compute number of chunks we'd need to split all events to in order to get
	# 'chunk_jobs'-many events in a job
	chunk_jobs = int(round(num_total_events / float(chunk_jobs)))
	print "Splitting events into %d chunks..." %(chunk_jobs)
	event_names = miso_events.events.keys()
	event_batches = cluster_utils.chunk_list(event_names, chunk_jobs)
	print "  - Total of %d event batches." %(len(event_batches))

    batch_lens = [len(batch) for batch in event_batches]
    max_events_per_batch = max(batch_lens)
    queue_thresh = 50
    num_batches = len(event_batches)
    long_batch = 100
    
    if max_events_per_batch >= queue_thresh and max_events_per_batch <= long_batch:
	print "Longest batch contains more than %d jobs -- changing queue type to short" \
              %(queue_thresh)
	queue_type = 'short'
    else:
        print "Longest batch contains more than %d jobs -- changing queue type to long" \
              %(long_batch)
        queue_type = 'long'

    for event_batch in event_batches:
        # Compile set of commands that will be run in the same job
	miso_event_cmd_list = []
	num_jobs_per_batch = len(event_batch)
	print "Processing a batch of size %d events" %(num_jobs_per_batch)
	for event_name in event_batch:
	    miso_event_cmd = '%s %s --run-two-iso-event \"%s\" %s %s --event-type %s --read-len %d --overhang-len %d' \
			     %(python_exe,
                               os.path.join(miso_path, 'run_miso.py'),
			       event_name,
			       events_filename,
			       psi_outdir,
			       event_type,
			       read_len,
			       overhang_len)
	    miso_event_cmd_list.append(miso_event_cmd)
	# Execute events in batch
	miso_event_batch_cmd = "; ".join(miso_event_cmd_list)
	#print "Executing batched command list: ", miso_event_batch_cmd
	if num_batches > 1:
	    event_name += "_batch"
	cluster_utils.run_on_cluster(miso_event_batch_cmd, event_name, psi_outdir,
                                     queue_type=queue_type)
	# Add pause to allow cluster to process jobs
	time.sleep(delay_constant)
    # Parse all events into genes
    events_to_genes = miso_events.loaded_events_to_genes(read_len=read_len,
                                                         overhang_len=overhang_len)
Example #4
0
def run_two_iso_on_cluster(miso_path,
                           events_filename,
                           event_type,
                           psi_outdir,
                           read_len,
                           overhang_len,
                           chunk_jobs=False):
    """
    Run two-isoform MISO on cluster.

    - chunk_jobs: the number of jobs in each batch.  All jobs in a batch will be assigned to the same processor on
      the cluster.  When chunk_jobs is not specified, each event gets sent as a separate job.

    Builds one run_miso.py command per event, joins each batch's commands
    with ';' into a single job, and submits each job via
    cluster_utils.run_on_cluster.
    """
    print "Running two isoform MISO on cluster..."
    # Load two-isoform events
    miso_events = as_events.MISOEvents(2,
                                       event_type,
                                       from_file=events_filename)
    num_total_events = len(miso_events.events)
    # Pause (seconds) between consecutive cluster submissions
    delay_constant = 0.9

    if not chunk_jobs:
        # Single batch holding the events dict; iterating it below yields
        # the event names (keys).
        event_batches = [miso_events.events]
    else:
        # Make sure we're chunking into more than one batch of jobs
        assert (chunk_jobs > 1)

        # Compute number of chunks we'd need to split all events to in order to get
        # 'chunk_jobs'-many events in a job
        chunk_jobs = int(round(num_total_events / float(chunk_jobs)))
        print "Splitting events into %d chunks..." % (chunk_jobs)
        event_names = miso_events.events.keys()
        event_batches = cluster_utils.chunk_list(event_names, chunk_jobs)
        print "  - Total of %d event batches." % (len(event_batches))

    batch_lens = [len(batch) for batch in event_batches]
    max_events_per_batch = max(batch_lens)
    queue_thresh = 50
    num_batches = len(event_batches)
    long_batch = 100

    # NOTE(review): batches smaller than queue_thresh also fall through to
    # the 'long' branch below -- confirm this queue selection is intended.
    if max_events_per_batch >= queue_thresh and max_events_per_batch <= long_batch:
        print "Longest batch contains more than %d jobs -- changing queue type to short" \
                     %(queue_thresh)
        queue_type = 'short'
    else:
        print "Longest batch contains more than %d jobs -- changing queue type to long" \
              %(long_batch)
        queue_type = 'long'

    for event_batch in event_batches:
        # Compile set of commands that will be run in the same job
        miso_event_cmd_list = []
        num_jobs_per_batch = len(event_batch)
        print "Processing a batch of size %d events" % (num_jobs_per_batch)
        for event_name in event_batch:
            miso_event_cmd = 'python %s --run-two-iso-event \"%s\" %s %s --event-type %s --read-len %d --overhang-len %d' \
               %(os.path.join(miso_path, 'run_miso.py'),
                 event_name,
                 events_filename,
                 psi_outdir,
                 event_type,
                 read_len,
                 overhang_len)
            miso_event_cmd_list.append(miso_event_cmd)
        # Execute events in batch
        miso_event_batch_cmd = "; ".join(miso_event_cmd_list)
        #print "Executing batched command list: ", miso_event_batch_cmd
        # NOTE(review): event_name is the leftover inner-loop variable, so
        # each multi-batch job is named after its batch's last event.
        if num_batches > 1:
            event_name += "_batch"
        cluster_utils.run_on_cluster(miso_event_batch_cmd,
                                     event_name,
                                     psi_outdir,
                                     queue_type=queue_type)
        # Add pause to allow cluster to process jobs
        time.sleep(delay_constant)
    # Parse all events into genes
    events_to_genes = miso_events.loaded_events_to_genes(
        read_len=read_len, overhang_len=overhang_len)