def run(self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
    """Summarize bowtie1 alignment logs into a spreadsheet.

    Looks up the log files for in_data.identifier (one <sample>.log per
    sample), parses each with alignlib.parse_bowtie1_output, writes a
    tab-delimited table to summary.txt in the current directory, and
    converts that table with txt2xls, redirecting the result to outfile.

    Raises AssertionError if no logs are found, if a log file does not
    have a .log extension, or if the txt2xls command exits with a
    non-zero status.
    """
    import os
    from genomicode import filelib
    from genomicode import parselib
    from genomicode import alignlib
    from genomicode import config
    from genomicode import parallel

    log_filenames = _find_output_logs(in_data.identifier)
    assert log_filenames

    results = {}  # dict of sample -> dictionary of output
    for filename in log_filenames:
        # <path>/<sample>.log
        sample, ext = os.path.splitext(os.path.basename(filename))
        assert ext == ".log"
        results[sample] = alignlib.parse_bowtie1_output(filename)

    # Make table where the rows are the samples and the columns
    # are the statistics.
    table = [("Sample", "Aligned Reads", "Total Reads", "Perc Aligned")]
    for sample in sorted(results):
        stats = results[sample]
        total_reads = stats["reads_processed"]
        aligned_reads = stats["aligned_reads"]
        # A sample with no reads would otherwise raise ZeroDivisionError.
        perc_aligned = 0.0
        if total_reads:
            perc_aligned = float(aligned_reads) / total_reads * 100
        row = (
            sample,
            parselib.pretty_int(aligned_reads),
            parselib.pretty_int(total_reads),
            "%.2f%%" % perc_aligned,
        )
        table.append(row)

    # Write out the table as text file.  The with-block closes the
    # handle even if a write fails.
    TXT_FILE = "summary.txt"
    with open(TXT_FILE, 'w') as handle:
        for row in table:
            handle.write("\t".join(row) + "\n")

    txt2xls = filelib.which_assert(config.txt2xls)
    # outfile is caller-supplied and may contain spaces or shell
    # metacharacters, so quote it like the executable.
    cmd = "%s -b %s > %s" % (
        parallel.quote(txt2xls), TXT_FILE, parallel.quote(outfile))
    status = os.system(cmd)
    assert status == 0, "Command failed (status %s): %s" % (status, cmd)
def change_directory(cache_path, arg):
    """Print a shell "cd" command for one module directory in the cache.

    cache_path  Directory handed to _list_module_directories; the chosen
                module directory is joined onto it.
    arg         Either an integer i (1-based index into the list of
                module paths; per the original comment, the ith most
                recent one), or a substring to look for in the module
                path names.  The first path containing it is used.

    Raises AssertionError if the index is out of range or if no module
    path contains arg.
    """
    import os
    from genomicode import jmath
    from genomicode import parallel

    module_paths = _list_module_directories(cache_path)
    if jmath.is_int(arg):
        # Go to the ith most recent module_path (1-based).
        i = int(arg)
        assert i > 0, "Index must be at least 1"
        # i == len(module_paths) is a valid request: it maps to the
        # last element, module_paths[len - 1].
        assert i <= len(module_paths), \
            "There are only %d modules" % len(module_paths)
        desired_path = module_paths[i - 1]
    else:
        matches = [p for p in module_paths if arg in p]
        assert matches, "I could not find path containing: %s" % arg
        desired_path = matches[0]

    full_path = os.path.join(cache_path, desired_path)
    # Parenthesized single-argument print behaves the same as the
    # Python 2 print statement.
    print("cd %s" % parallel.quote(full_path))
def run(self, network, in_data, out_attributes, user_options, num_cores,
        out_path):
    """Run fastqc on every FASTQ file found for in_data.identifier.

    Creates out_path, runs one "fastqc --outdir=<out_path> --extract
    <file>" command per FASTQ file through parallel.pshell (at most
    num_cores at a time), then removes each .zip in out_path that sits
    next to an entry of the same name without the .zip suffix.

    Raises AssertionError if no FASTQ files are found.
    """
    import os
    from genomicode import filelib
    from genomicode import parallel
    from Betsy import module_utils as mlib

    filenames = mlib.find_fastq_files(in_data.identifier)
    assert filenames, "FASTQ files not found: %s" % in_data.identifier
    filelib.safe_mkdir(out_path)

    metadata = {}

    quote = parallel.quote
    fastqc_q = quote(mlib.findbin("fastqc"))
    # Quote the output directory and each input file as well as the
    # binary, so paths containing spaces or shell metacharacters
    # survive the shell.
    commands = [
        "%s --outdir=%s --extract %s" % (fastqc_q, quote(out_path), quote(x))
        for x in filenames
    ]
    metadata["commands"] = commands
    metadata["num_cores"] = num_cores
    parallel.pshell(commands, max_procs=num_cores)

    # Fastqc generates files:
    # <file>_fastqc/
    # <file>_fastqc.zip
    # The contents of the .zip file are identical to the directories,
    # so whenever an entry has a matching "<entry>.zip" next to it,
    # delete the redundant .zip.
    for entry in os.listdir(out_path):
        zip_filename = "%s.zip" % os.path.join(out_path, entry)
        if os.path.exists(zip_filename):
            os.unlink(zip_filename)

    # NOTE(review): metadata is filled in but never returned.  Confirm
    # whether the caller expects "return metadata" here.
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    """Call peaks for a treatment sample, optionally against a control.

    antecedents is a pair (bam_node, group_node): a path of BAM files
    and a sample group file.  The user options "treatment_sample" and
    "macs_genome" are required; "control_sample" and "macs_shiftsize"
    are optional.  The command built by make_macs14_command is run with
    out_path as its path, and if it produced a <name>_model.r file,
    that file is run through Rscript.

    Raises AssertionError if a sample is not in the group file or its
    BAM file cannot be found.  The final check that <name>_peaks.xls
    and <name>_summits.bed exist is delegated to
    filelib.assert_exists_nz_many.
    """
    import os
    from genomicode import parallel
    from genomicode import hashlib
    from genomicode import filelib
    from genomicode import config
    from Betsy import module_utils

    bam_node, group_node = antecedents
    bam_path = module_utils.check_inpath(bam_node.identifier)
    sample_groups = module_utils.read_sample_group_file(
        group_node.identifier)

    # Get options.
    treat_sample = module_utils.get_user_option(
        user_options, "treatment_sample", not_empty=True)
    control_sample = module_utils.get_user_option(
        user_options, "control_sample")
    genome_size = module_utils.get_user_option(
        user_options, "macs_genome", not_empty=True)
    shiftsize = module_utils.get_user_option(
        user_options, "macs_shiftsize")
    if shiftsize:
        shiftsize = int(shiftsize)

    # Set the name.  Both halves go through hash_var so the treatment
    # part is the same whether or not a control sample is given; the
    # original used the raw treat_sample in the "_vs_" form.
    name = hashlib.hash_var(treat_sample)
    if control_sample:
        name = "%s_vs_%s" % (name, hashlib.hash_var(control_sample))

    # Make sure the samples exist.
    samples = [x[1] for x in sample_groups]
    assert treat_sample in samples, "Unknown sample: %s" % treat_sample
    if control_sample:
        assert control_sample in samples, \
            "Unknown sample: %s" % control_sample

    # Find the BAM files.
    treat_filename = find_bam_file(bam_path, treat_sample, sample_groups)
    assert treat_filename, "Missing bam file for %s" % treat_sample
    control_filename = None
    if control_sample:
        control_filename = find_bam_file(
            bam_path, control_sample, sample_groups)
        assert control_filename, \
            "Missing bam file for %s" % control_sample

    cmd = make_macs14_command(
        treat_filename, control_filename, name=name,
        genome_size=genome_size, shiftsize=shiftsize,
        save_bedgraph_file=True)
    parallel.sshell(cmd, path=out_path)

    # Run Rscript on the model, if one was generated.
    model_file = os.path.join(out_path, "%s_model.r" % name)
    if os.path.exists(model_file):
        Rscript = filelib.which_assert(config.Rscript)
        # Quote the script path too; it includes out_path, which may
        # contain spaces.
        cmd = [parallel.quote(Rscript), parallel.quote(model_file)]
        parallel.sshell(cmd, path=out_path)

    files = [
        "%s_peaks.xls" % name,
        "%s_summits.bed" % name,
    ]
    filenames = [os.path.join(out_path, x) for x in files]
    filelib.assert_exists_nz_many(filenames)
def run(self, network, in_data, out_attributes, user_options, num_cores,
        outfile):
    """Summarize tophat alignment summaries into a spreadsheet.

    Asks filelib.list_files_in_path for the files under
    in_data.identifier whose names end with align_summary.txt, parses
    each with alignlib.parse_tophat_align_summary, writes a
    tab-delimited table to summary.txt in the current directory, and
    converts that table with txt2xls, redirecting the result to outfile.

    Raises AssertionError if no summaries are found, if a file is not
    laid out as <sample>.tophat/align_summary.txt, or if the txt2xls
    command exits with a non-zero status.
    """
    import os
    from genomicode import filelib
    from genomicode import parselib
    from genomicode import alignlib
    from genomicode import config
    from genomicode import parallel

    align_filenames = filelib.list_files_in_path(
        in_data.identifier, endswith="align_summary.txt")
    assert align_filenames, "Missing align_summary.txt"

    TOPHAT_EXT = ".tophat"
    results = {}  # dict of sample -> dictionary of output
    for filename in align_filenames:
        # Names must be in the format:
        # <path>/<sample>.tophat/align_summary.txt
        # full_path   <path>/<sample>.tophat
        # tophat_dir  <sample>.tophat
        # file_       align_summary.txt
        # sample      <sample>
        full_path, file_ = os.path.split(filename)
        tophat_dir = os.path.basename(full_path)
        assert file_ == "align_summary.txt"
        assert tophat_dir.endswith(TOPHAT_EXT)
        sample = tophat_dir[:-len(TOPHAT_EXT)]
        results[sample] = alignlib.parse_tophat_align_summary(filename)

    # Make table where the rows are the samples and the columns
    # are the statistics.
    table = [("Sample", "Aligned Reads", "Total Reads", "Perc Aligned")]
    for sample in sorted(results):
        stats = results[sample]
        total_reads = stats["reads_processed"]
        aligned_reads = stats["aligned_reads"]
        # A sample with no reads would otherwise raise ZeroDivisionError.
        perc_aligned = 0.0
        if total_reads:
            perc_aligned = float(aligned_reads) / total_reads * 100
        row = (
            sample,
            parselib.pretty_int(aligned_reads),
            parselib.pretty_int(total_reads),
            "%.2f%%" % perc_aligned,
        )
        table.append(row)

    # Write out the table as text file.  The with-block closes the
    # handle even if a write fails.
    TXT_FILE = "summary.txt"
    with open(TXT_FILE, 'w') as handle:
        for row in table:
            handle.write("\t".join(row) + "\n")

    txt2xls = filelib.which_assert(config.txt2xls)
    # outfile is caller-supplied and may contain spaces or shell
    # metacharacters, so quote it like the executable.
    cmd = "%s -b %s > %s" % (
        parallel.quote(txt2xls), TXT_FILE, parallel.quote(outfile))
    status = os.system(cmd)
    assert status == 0, "Command failed (status %s): %s" % (status, cmd)
def sq(name):
    """Return name quoted for use in a shell command.

    Short alias that forwards to genomicode.parallel.quote.
    """
    from genomicode import parallel

    quoted = parallel.quote(name)
    return quoted