# Example 1
 def setUp(self):
     """Prepare the paths, settings and commands shared by the tests."""
     # Locate the directory this test module lives in; everything else
     # is resolved relative to it.
     base_dir = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
     self.miso_path = base_dir
     self.tests_data_dir = os.path.join(base_dir, "test-data")
     self.tests_output_dir = os.path.join(base_dir, "test-output")

     # Load the settings, then resolve which Python interpreter to invoke
     self.settings = load_settings(None)
     self.python_executable = Settings.get_python_executable()
     self.events_analysis_cmd = "%s %s " %(self.python_executable,
                                           os.path.join(base_dir,
                                                        "run_events_analysis.py"))

     # Fixture data and helper scripts used by individual tests
     self.test_sam_filename = os.path.join(self.tests_data_dir,
                                           "sam-data",
                                           "c2c12.Atp2b1.sam")
     self.gff_events_dir = os.path.join(base_dir, "gff-events")
     self.sam_to_bam_script = os.path.join(base_dir, "sam_to_bam.py")
     self.index_gff_script = os.path.join(base_dir, "index_gff.py")
# Example 2
def compute_all_genes_psi(gff_dir, bam_filename, read_len, output_dir,
                          use_cluster=False, chunk_jobs=200,
                          overhang_len=1, paired_end=None,
                          settings=None):
    """
    Compute Psi values for genes using a GFF and a BAM filename.
    """
    gene_ids_to_gff_index = gff_utils.get_gene_ids_to_gff_index(gff_dir)

    num_genes = len(gene_ids_to_gff_index.keys())

    miso_run = os.path.join(miso_path, "run_miso.py")
    
    miso_settings = Settings.load(settings)
    python_executable = Settings.get_python_executable()
    
    print "Computing gene-level Psi for %d genes..." \
          %(num_genes)
    print "  - GFF index: %s" %(gff_dir)
    print "  - BAM: %s" %(bam_filename)
    print "  - Read length: %d" %(read_len)
    print "  - Output directory: %s" %(output_dir)

    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)
    
    # All commands to run
    all_miso_cmds = []

    for gene_id, gff_index_filename in gene_ids_to_gff_index.iteritems():
        miso_cmd = "%s %s --compute-gene-psi \"%s\" \"%s\" %s %s --read-len %d " \
                   %(python_executable, miso_run, gene_id, gff_index_filename, bam_filename, output_dir,
                     read_len)
        
        if paired_end != None:
            # Run in paired-end mode
            frag_mean = float(paired_end[0])
            frag_sd = float(paired_end[1])
            miso_cmd += " --paired-end %.1f %.1f" %(frag_mean, frag_sd)
        else:
            miso_cmd += " --overhang-len %d" %(overhang_len)

        # Add settings filename if given
        if settings != None:
            miso_cmd += " --settings-filename %s" %(settings)

        if use_cluster:
            # If asked to use cluster, accumulate the MISO commands
            # but do not run them
            all_miso_cmds.append(miso_cmd)
        else:
            print "  - Executing: %s" %(miso_cmd)
            os.system(miso_cmd)


    if use_cluster:
        # Threshold for putting jobs in the long queue
        long_thresh = 50

        # Delay between jobs
        delay_constant = 0.9
        
        # Invoke the commands using the cluster
        print "Sending %d genes to be run on cluster in chunks of %d..." \
              %(num_genes, chunk_jobs)

        if not chunk_jobs:
            print "  - Using default chunk jobs = %d" %(200)
            chunk_jobs = 200

	chunk_jobs = max(1, int(round(num_genes / float(chunk_jobs))))

        # Split the gene records into batches
	cmd_batches = cluster_utils.chunk_list(all_miso_cmds, chunk_jobs)

        time_str = time.strftime("%m-%d-%y_%H:%M:%S")

        for batch_num, batch in enumerate(cmd_batches):
            batch_size = len(batch)
            print "Running batch %d (batch size = %d)" %(batch_num,
                                                         batch_size)

            if batch_size >= long_thresh:
                queue_type = "long"
            else:
                queue_type = "short"
            
            # Pool all the MISO commands belonging to this batch
            batch_logs_dir = os.path.join(output_dir, "batch-logs")
            if not os.path.isdir(batch_logs_dir):
                os.makedirs(batch_logs_dir)
            batch_logfile = os.path.join(batch_logs_dir,
                                         "batch-%d-%s.log" %(batch_num,
                                                             time_str))
            redirected_output = " >> %s;\n" %(batch_logfile)
            cmd_to_run = redirected_output.join(batch)

            # Run on cluster
            job_name = "gene_psi_batch_%d" %(batch_num)
            cluster_utils.run_on_cluster(cmd_to_run, job_name, output_dir,
                                         queue_type=queue_type,
                                         settings=settings)
            time.sleep(delay_constant)
# Example 3
def run_two_iso_on_cluster(miso_path, events_filename, event_type, psi_outdir,
                           read_len, overhang_len, chunk_jobs=False):
    """
    Run two-isoform MISO on cluster.

    - chunk_jobs: the number of jobs in each batch.  All jobs in a batch will be assigned to the same processor on
      the cluster.  When chunk_jobs is not specified, each event gets sent as a separate job.
    """
    print "Running two isoform MISO on cluster..."
    #Load python executable name
    settings = Settings.get()
    python_executable = Settings.get_python_executable()
    # Load two-isoform events
    miso_events = as_events.MISOEvents(2, event_type, from_file=events_filename)
    num_total_events = len(miso_events.events)
    delay_constant = 0.9

    if not chunk_jobs:
	event_batches = [miso_events.events]
    else:
	# Make sure we're chunking into more than one batch of jobs
	assert(chunk_jobs > 1)

        # Compute number of chunks we'd need to split all events to in order to get
	# 'chunk_jobs'-many events in a job
	chunk_jobs = int(round(num_total_events / float(chunk_jobs)))
	print "Splitting events into %d chunks..." %(chunk_jobs)
	event_names = miso_events.events.keys()
	event_batches = cluster_utils.chunk_list(event_names, chunk_jobs)
	print "  - Total of %d event batches." %(len(event_batches))

    batch_lens = [len(batch) for batch in event_batches]
    max_events_per_batch = max(batch_lens)
    queue_thresh = 50
    num_batches = len(event_batches)
    long_batch = 100
    
    if max_events_per_batch >= queue_thresh and max_events_per_batch <= long_batch:
	print "Longest batch contains more than %d jobs -- changing queue type to short" \
              %(queue_thresh)
	queue_type = 'short'
    else:
        print "Longest batch contains more than %d jobs -- changing queue type to long" \
              %(long_batch)
        queue_type = 'long'

    for event_batch in event_batches:
        # Compile set of commands that will be run in the same job
	miso_event_cmd_list = []
	num_jobs_per_batch = len(event_batch)
	print "Processing a batch of size %d events" %(num_jobs_per_batch)
	for event_name in event_batch:
	    miso_event_cmd = '%s %s --run-two-iso-event \"%s\" %s %s --event-type %s --read-len %d --overhang-len %d' \
			     %(python_executable,
                   os.path.join(miso_path, 'run_miso.py'),
			       event_name,
			       events_filename,
			       psi_outdir,
			       event_type,
			       read_len,
			       overhang_len)
	    miso_event_cmd_list.append(miso_event_cmd)
	# Execute events in batch
	miso_event_batch_cmd = "; ".join(miso_event_cmd_list)
	#print "Executing batched command list: ", miso_event_batch_cmd
	if num_batches > 1:
	    event_name += "_batch"
	cluster_utils.run_on_cluster(miso_event_batch_cmd, event_name, psi_outdir,
                                     queue_type=queue_type)
	# Add pause to allow cluster to process jobs
	time.sleep(delay_constant)
    # Parse all events into genes
    events_to_genes = miso_events.loaded_events_to_genes(read_len=read_len,
                                                         overhang_len=overhang_len)
# Example 4
def compute_psi(sample_filenames, output_dir, event_type, read_len, overhang_len,
		use_cluster=False, chunk_jobs=False, filter_events=True,
                events_info_filename=None, settings_filename=None):
    """
    Compute Psi values for skipped exons.  Sample filenames is a mapping from
    sample label to sample.

      - sample_filenames = [[sample_label1, sample_filename1],
                            [sample_label2, sample_filename2]]
      - output_dir: output directory
      - event_type: 'SE', 'RI', etc.
    """
    if not os.path.isdir(output_dir):
	os.makedirs(output_dir)
	
    output_dir = os.path.join(output_dir, event_type)
    output_dir = os.path.abspath(output_dir)
    if not os.path.isdir(output_dir):
	os.makedirs(output_dir)
	
    print "Computing Psi for events of type %s" %(event_type)
    print "  - samples used: ", sample_filenames.keys()
    if not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    for sample_label, sample_filename in sample_filenames.iteritems():
	print "Processing sample: label=%s, filename=%s" %(sample_label, sample_filename)
	results_output_dir = os.path.join(output_dir, sample_label)
        if not os.path.isdir(results_output_dir):
            os.makedirs(results_output_dir)

	# Load the set of counts and serialize them into JSON
	events = as_events.load_event_counts(sample_filename, event_type,
                                             events_info_filename=events_info_filename)

	# Filter events
	if filter_events:
	    print "Filtering events..."
	    events.filter_events(settings=Settings.get())

	print "Running on a total of %d events." %(len(events.events))
	    
	events_filename = events.output_file(results_output_dir, sample_label)
	
	# Run MISO on them
	miso_cmd = '%s %s --compute-two-iso-psi %s %s --event-type %s --read-len %d --overhang-len %d ' \
                   %(Settings.get_python_executable(), os.path.join(miso_path, 'run_miso.py'),
                     events_filename,
                     results_output_dir,
                     event_type,
                     read_len,
                     overhang_len)
	if use_cluster:
	    if chunk_jobs:
		miso_cmd += ' --use-cluster --chunk-jobs %d' %(chunk_jobs)
	    else:
		miso_cmd += ' --use-cluster'
        print "Executing: %s" %(miso_cmd)
	if use_cluster:
	    print " - Using cluster"
	os.system(miso_cmd)