def is_format(locator_str, hrows=None, hcols=None):
    from genomicode import filelib
    import util
    if hrows not in [None, 1]:
        return False
    if hcols not in [None, 4]:
        return False
    # This will only work if locator_str is a string.
    if not filelib.exists(locator_str):
        return False

    # Read 5 lines and check the headers.  If the file is small, this
    # may contain fewer than 5 lines.
    handle = filelib.openfh(locator_str)
    lines = [handle.readline() for i in range(5)]
    handle.close()   # need to close it properly, or gunzip might not die.
    lines = [x for x in lines if x]
    matrix = [line.rstrip("\r\n").split("\t") for line in lines]

    # Make sure there's at least 1 line.
    if not matrix:
        return False

    header = matrix[0]
    if header[:len(ROW_HEADERS)] != ROW_HEADERS:
        return False

    # Check if there's extraneous stuff.
    nr, nc = util.num_headers(matrix)
    if nc > 4:
        return False
    return True
def is_format(locator_str, hrows=None, hcols=None):
    from genomicode import filelib
    if not filelib.exists(locator_str):
        return False

    # Read 5 lines and count the headers.
    handle = filelib.openfh(locator_str)
    lines = [handle.readline() for i in range(5)]
    handle.close()   # need to close it properly, or gunzip might not die.
    lines = [x for x in lines if x]
    matrix = [line.rstrip("\r\n").split("\t") for line in lines]

    if len(matrix) < 3:
        return False
    # Line 3 should contain only 1 column.
    if len(matrix[2]) != 1:
        return False
    # Line 1 contains 1 more column than line 2.
    if len(matrix[0]) != len(matrix[1]) + 1:
        return False
    if len(matrix[0]) < 2:
        return False
    x = [x.upper() for x in matrix[0][:2]]
    if sorted(x) != sorted(["ACCESSION", "DESCRIPTION"]):
        return False
    return True
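
# A minimal sketch of the layout the check above accepts, using
# hypothetical file contents: the first two header fields are
# ACCESSION/DESCRIPTION in either order, line 1 has one more column
# than line 2, and line 3 has exactly one column.  Run by hand with
# this module's is_format in scope.
def _demo_res_like_format():
    import os
    import tempfile
    lines = [
        "Description\tAccession\tS1\tS2",   # 4 columns
        "\tA\tB",                           # 3 columns: one fewer
        "2",                                # exactly 1 column
        "gene one\tACC1\t0.5\t0.7",
        "gene two\tACC2\t0.1\t0.9",
        ]
    fd, filename = tempfile.mkstemp()
    try:
        os.write(fd, "\n".join(lines) + "\n")
        os.close(fd)
        print is_format(filename)   # expected: True
    finally:
        os.unlink(filename)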
def is_format(locator_str, hrows=None, hcols=None):
    from genomicode import filelib
    if not filelib.exists(locator_str):
        return False
    if hrows not in [None, 1]:
        return False
    if hcols not in [None, 2]:
        return False

    # Read 5 lines and check the headers.
    handle = filelib.openfh(locator_str)
    lines = [handle.readline() for i in range(5)]
    handle.close()   # need to close it properly, or gunzip might not die.
    lines = [x for x in lines if x]
    matrix = [line.rstrip("\r\n").split("\t") for line in lines]

    if len(matrix) < 3:
        return False
    # First line could be just one column, or could be many columns.
    if len(matrix[0]) < 1:
        return False
    # Second line must have at least 2 columns.
    if len(matrix[1]) < 2:
        return False
    if matrix[0][0] != "#1.2":
        return False
    #if matrix[2][0].strip().upper() != "NAME":
    #    return False
    #if matrix[2][1].strip().upper() != "DESCRIPTION":
    #    return False
    return True
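
# A minimal sketch of the first lines this check accepts, using
# hypothetical values: the "#1.2" version line, a dimensions line,
# then the header and data rows.  Run by hand with this module's
# is_format in scope.
def _demo_gct_format():
    import os
    import tempfile
    lines = [
        "#1.2",
        "2\t3",
        "NAME\tDESCRIPTION\tS1\tS2\tS3",
        "g1\tfirst gene\t0.1\t0.2\t0.3",
        "g2\tsecond gene\t0.4\t0.5\t0.6",
        ]
    fd, filename = tempfile.mkstemp()
    try:
        os.write(fd, "\n".join(lines) + "\n")
        os.close(fd)
        print is_format(filename)   # expected: True
    finally:
        os.unlink(filename)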
def is_format(locator_str, hrows=None, hcols=None):
    from genomicode import filelib
    if not filelib.exists(locator_str):
        return False

    # Read 5 lines and check the headers.  If the file is small, this
    # may contain fewer than 5 lines.
    handle = filelib.openfh(locator_str)
    lines = [handle.readline() for i in range(5)]
    handle.close()   # need to close it properly, or gunzip might not die.
    lines = [x for x in lines if x]
    matrix = [line.rstrip("\r\n").split("\t") for line in lines]
    matrix = _clean_tdf(matrix)

    # Make sure there's at least 1 line.
    if not matrix:
        return False
    # All rows should contain at least one column.
    for x in matrix:
        if not x:
            return False
    # All rows should contain the same number of columns.
    for x in matrix:
        if len(x) != len(matrix[0]):
            return False
    return True
def is_format(locator_str, hrows=None, hcols=None):
    from genomicode import filelib
    if not filelib.exists(locator_str):
        return False

    handle = filelib.openfh(locator_str)
    x = handle.readline()
    handle.close()   # need to close it properly, or gunzip might not die.
    if not x:   # blank file
        return False
    if "," in x:
        return True
    return False
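
# A quick sketch of the comma heuristic above, with hypothetical file
# contents.  Note that any comma in the first line counts, so a
# tab-delimited file with a comma inside an annotation would also be
# called a CSV.  Run by hand with this module's is_format in scope.
def _demo_csv_format():
    import os
    import tempfile
    fd, filename = tempfile.mkstemp()
    try:
        os.write(fd, "GENE,S1,S2\ng1,0.5,0.7\n")
        os.close(fd)
        print is_format(filename)   # expected: True
    finally:
        os.unlink(filename)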
def is_format(locator_str, hrows=None, hcols=None):
    from genomicode import filelib
    import util
    if not filelib.exists(locator_str):
        return False

    # Read 20 lines and count the headers.  Previously we read only 5
    # lines, but sometimes that is not enough; we have seen a matrix
    # with 13 lines of header.
    handle = filelib.openfh(locator_str)
    lines = [handle.readline() for i in range(20)]
    handle.close()   # need to close it properly, or gunzip might not die.
    lines = [x for x in lines if x]
    matrix = [line.rstrip("\r\n").split("\t") for line in lines]

    # Make sure there's at least 1 line.
    if not matrix:
        return False
    # All rows should contain the same number of columns.
    for cols in matrix:
        if len(cols) != len(matrix[0]):
            return False

    nr, nc = util.num_headers(matrix)
    nrow = hrows or nr
    ncol = hcols or nc
    if nrow < 1 or nrow > 4:
        return False
    if ncol < 1 or ncol > 5:
        return False

    header_def = [
        (0, 0, "GID"),
        (0, 2, "NAME"),
        (0, 3, "GWEIGHT"),
        (0, 4, "GORDER"),
        (1, 0, "AID"),
        (2, 0, "EWEIGHT"),
        (3, 0, "EORDER"),
        ]
    for row, col, name in header_def:
        if nrow > row and ncol > col:
            if matrix[row][col].strip().upper() != name:
                return False
    return True
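
# A sketch of the CDT-style header layout checked above, with
# hypothetical values.  This assumes util.num_headers reports 3 header
# rows and 4 header columns for this layout; run by hand with this
# module's is_format in scope.
def _demo_cdt_format():
    import os
    import tempfile
    lines = [
        "GID\tYORF\tNAME\tGWEIGHT\tS1\tS2",
        "AID\t\t\t\tA1\tA2",
        "EWEIGHT\t\t\t\t1\t1",
        "GENE1X\tY001\tgene one\t1\t0.5\t0.7",
        ]
    fd, filename = tempfile.mkstemp()
    try:
        os.write(fd, "\n".join(lines) + "\n")
        os.close(fd)
        print is_format(filename)   # expected: True
    finally:
        os.unlink(filename)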
def is_format(locator_str, hrows=None, hcols=None):
    from genomicode import filelib
    import util
    if not filelib.exists(locator_str):
        return False

    # Read NUM_LINES lines and count the headers.  Previously we read
    # only 5 lines, and had problems: in one matrix, an annotation
    # column had spaces in the first 5 lines, so it was mistakenly
    # treated as part of the matrix rather than part of the
    # annotations.  Should look at least at the first 100 lines, since
    # U133Av2 has 62 AFFX genes that may or may not have annotations.
    #NUM_LINES = 25
    NUM_LINES = 100
    handle = filelib.openfh(locator_str)
    lines = [handle.readline() for i in range(NUM_LINES)]
    handle.close()   # need to close it properly, or gunzip might not die.
    lines = [x for x in lines if x]
    matrix = [line.rstrip("\r\n").split("\t") for line in lines]

    # Make sure there's at least 1 line (the header).
    if not matrix:
        return False
    # All rows should contain the same number of columns.
    for cols in matrix:
        if len(cols) != len(matrix[0]):
            return False

    nr, nc = util.num_headers(matrix)
    nrow = hrows or nr
    ncol = hcols or nc
    # PCL requires at least the gene IDs.
    if ncol == 0:
        return False
    #if nrow == 0 and ncol == 0:
    #    return False
    # Assume at least 1 header row, even if num_headers did not find one.
    nrow = max(nrow, 1)
    if nrow < 1 or nrow > 3:
        return False
    # PCL format has at most 4 header columns.
    if ncol > 4:
        return False
    #if ncol > 2:
    #    ncol = 2
    #if ncol < 2 or ncol > 4:
    #    return False

    header_def = [
        (0, 1, "NAME"),
        (0, 2, "GWEIGHT"),
        (0, 3, "GORDER"),
        (1, 0, "EWEIGHT"),
        (2, 0, "EORDER"),
        ]
    for row, col, name in header_def:
        if nrow > row and ncol > col:
            if matrix[row][col].strip().upper() != name:
                return False
    return True
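
# A sketch of a minimal PCL-style header accepted by the check above,
# with hypothetical values.  This assumes util.num_headers reports 2
# header rows and 3 header columns for this layout; run by hand with
# this module's is_format in scope.
def _demo_pcl_format():
    import os
    import tempfile
    lines = [
        "YORF\tNAME\tGWEIGHT\tS1\tS2",
        "EWEIGHT\t\t\t1\t1",
        "GENE1X\tgene one\t1\t0.5\t0.7",
        ]
    fd, filename = tempfile.mkstemp()
    try:
        os.write(fd, "\n".join(lines) + "\n")
        os.close(fd)
        print is_format(filename)   # expected: True
    finally:
        os.unlink(filename)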
def main():
    import os
    import sys
    #from optparse import OptionParser, OptionGroup
    from optparse import OptionParser

    usage = "usage: %prog [options] <file1> <file2> ..."
    parser = OptionParser(usage=usage, version="%prog 01")
    parser.add_option(
        "-f", "--num_factors", dest="num_factors", type="int", default=15,
        help="Number of factors to use for normalization.")
    # Any string in the control probe file can be a control probe.
    # Delimited by tabs and newlines.
    parser.add_option(
        "", "--control_probe_file", dest="control_probe_file", default=None,
        help="File that contains the control probes.")
    parser.add_option(
        "", "--python", dest="python", default=None,
        help="Specify the command to run python (optional).")
    parser.add_option(
        "", "--bfrm", dest="bfrm_path", default=None,
        help="Specify the path to the BFRM_normalize directory.")
    parser.add_option(
        "", "--matlab", dest="matlab", default="matlab",
        help="Specify the command to run matlab.")
    parser.add_option(
        "", "--arrayplot", dest="arrayplot", default=None,
        help="Specify the command to run arrayplot.")
    parser.add_option(
        "", "--povray", dest="povray", default="povray",
        help="Specify the command to run povray.")
    parser.add_option(
        "", "--cluster", dest="cluster", default=None,
        help="Specify the command to run cluster.")
    parser.add_option(
        "", "--libpath", dest="libpath", action="append", default=[],
        help="Add to the Python library search path.")
    parser.add_option(
        "-o", "--outpath", dest="outpath", type="string", default=None,
        help="Save files in this path.")
    parser.add_option(
        "-z", "--archive", dest="archive", action="store_true", default=None,
        help="Archive the raw output.  Helpful for GenePattern.")

    # Parse the arguments.
    options, args = parser.parse_args()
    if options.libpath:
        sys.path = options.libpath + sys.path
    # Import these after the library path is set.
    import time
    import arrayio
    from genomicode import filelib
    from genomicode import archive
    from genomicode import genepattern

    start_time = time.time()
    genepattern.fix_environ_path()

    if not args:
        parser.error("Please specify files to normalize.")
    filenames = args
    names = [os.path.split(x)[-1] for x in filenames]
    for filename in filenames:
        assert filelib.exists(filename), "File not found: %s" % filename

    # Check to make sure the value for num_factors is reasonable.
    MIN_FACTORS, MAX_FACTORS = 1, 100
    if options.num_factors < MIN_FACTORS:
        if MIN_FACTORS == 1:
            parser.error("At least %d factor is required." % MIN_FACTORS)
        else:
            parser.error("At least %d factors are required." % MIN_FACTORS)
    elif options.num_factors > MAX_FACTORS:
        parser.error("%d factors is too many.  Maximum is %d." %
                     (options.num_factors, MAX_FACTORS))

    # Set up the files.
    file_layout = make_file_layout(options.outpath)
    init_paths(file_layout)

    # Read each of the input files and align them.
    matrices = read_matrices(filenames)

    # Make sure the number of factors doesn't exceed the size of the
    # matrices.
    if matrices and options.num_factors > matrices[0].nrow():
        parser.error("Too many factors.")

    # Standardize each of the matrices to GCT format.
    if 1:   # for debugging
        for i in range(len(matrices)):
            matrices[i] = arrayio.convert(
                matrices[i], to_format=arrayio.gct_format)
        write_dataset(file_layout.DS_ORIG, matrices)

    # Log each of the matrices if needed.
    if 1:   # for debugging
        log_matrices(names, matrices)
        write_dataset(file_layout.DS_PROC, matrices)
        sys.stdout.flush()

    # Format the parameters and output files for bfrm.
    if 1:   # for debugging
        run_bfrm(
            options.bfrm_path, options.num_factors,
            options.control_probe_file, file_layout, options.matlab)

    # Generate some files for output.
    if 1:   # for debugging
        summarize_dataset(file_layout)
        summarize_filtered_genes(file_layout)
        summarize_heatmaps(
            options.python, options.arrayplot, options.cluster,
            file_layout, options.libpath)
        summarize_pca(options.povray, file_layout, matrices)
        summarize_report(
            filenames, matrices, options.num_factors, start_time,
            file_layout)

    # Archive the BFRM stuff, and the big files.
    if options.archive:
        print "Archiving results."
        archive.zip_path(file_layout.BFRM, noclobber=False)
        archive.zip_path(file_layout.ATTIC, noclobber=False)
        #archive.zip_path(file_layout.DS_PROC, noclobber=False)
        #archive.zip_path(file_layout.DS_FINAL, noclobber=False)

    print "Done."
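
# The script above is normally run from the command line.  A
# hypothetical invocation (the script and file names are assumptions,
# not from the source):
#
#   python bfrm_normalize.py -f 15 -o normalized --archive \
#       batch1.gct batch2.gct

if __name__ == '__main__':
    main()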
def run(self, network, antecedents, out_attributes, user_options, num_cores,
        out_path):
    import os
    from genomicode import filelib
    from genomicode import alignlib
    from genomicode import parallel
    from genomicode import hashlib
    from Betsy import module_utils as mlib

    fastq_node, sample_node, strand_node, reference_node = antecedents
    fastq_files = mlib.find_merged_fastq_files(
        sample_node.identifier, fastq_node.identifier)
    assert fastq_files, "I could not find any FASTQ files."
    ref = alignlib.create_reference_genome(reference_node.identifier)
    stranded = mlib.read_stranded(strand_node.identifier)
    filelib.safe_mkdir(out_path)

    metadata = {}
    metadata["tool"] = "RSEM %s" % alignlib.get_rsem_version()

    # Figure out whether to align to genome or transcriptome.
    x = out_attributes["align_to"]
    assert x in ["genome", "transcriptome"]
    align_to_genome = (x == "genome")

    # RSEM makes files:
    #   <sample_name>.genome.bam
    #   <sample_name>.transcript.bam
    #   <sample_name>.genes.results
    #   <sample_name>.isoforms.results
    #   <sample_name>.stat
    #
    # It does not work right if there is a space in the sample name.
    # Therefore, give a hashed sample name, and then rename later.

    # Make a list of the jobs to run.
    jobs = []
    for x in fastq_files:
        sample, pair1, pair2 = x
        sample_h = hashlib.hash_var(sample)
        x1, x2, x3 = mlib.splitpath(pair1)
        x = "%s%s" % (hashlib.hash_var(x2), x3)
        pair1_h = os.path.join(out_path, x)
        # Bug fix: make sure pair2_h is defined for single-end samples.
        pair2_h = None
        if pair2:
            x1, x2, x3 = mlib.splitpath(pair2)
            x = "%s%s" % (hashlib.hash_var(x2), x3)
            pair2_h = os.path.join(out_path, x)
        results_filename = os.path.join(out_path, "%s.genes.results" % sample)
        log_filename = os.path.join(out_path, "%s.log" % sample)
        x = filelib.GenericObject(
            sample=sample, sample_h=sample_h,
            pair1=pair1, pair2=pair2,
            pair1_h=pair1_h, pair2_h=pair2_h,
            results_filename=results_filename,
            log_filename=log_filename)
        jobs.append(x)

    # Make sure the hashed samples are unique.
    seen = {}
    for j in jobs:
        assert j.sample_h not in seen, \
            "Dup (%d): %s" % (len(jobs), j.sample_h)
        assert j.pair1_h not in seen
        seen[j.sample_h] = 1
        seen[j.pair1_h] = 1
        if j.pair2_h:
            assert j.pair2_h not in seen
            seen[j.pair2_h] = 1

    # Symlink the fastq files.
    for j in jobs:
        os.symlink(j.pair1, j.pair1_h)
        if j.pair2:
            os.symlink(j.pair2, j.pair2_h)

    s2fprob = {
        "unstranded": None,
        "firststrand": 0.0,
        "secondstrand": 1.0,
        }
    assert stranded.stranded in s2fprob, \
        "Unknown stranded: %s" % stranded.stranded
    forward_prob = s2fprob[stranded.stranded]

    # How much memory for bowtie.  May need to increase this if there
    # are lots of memory warnings in the log files:
    #   Warning: Exhausted best-first chunk memory for read
    #   ST-J00106:110:H5NY5BBXX:6:1101:18203:44675 1:N:0:1/1
    #   (patid 2076693); skipping read
    # Default is 64.  Too high a value can also cause problems.
    #chunkmbs = 4*1024   # Generates warnings.
    chunkmbs = 512

    # Get lots of warnings with bowtie:
    #   Warning: Detected a read pair whose two mates have different names
    # Use the STAR aligner instead.
    use_STAR = True

    sq = parallel.quote
    commands = []
    for j in jobs:
        # Debug: If the results file exists, don't run it again.
        if filelib.exists_nz(j.results_filename) and \
               filelib.exists(j.log_filename):
            continue
        # If using the STAR aligner, the most memory-efficient way is
        # to let STAR take care of the multiprocessing.
        nc = max(1, num_cores / len(jobs))
        if use_STAR:
            nc = num_cores
        keywds = {}
        if use_STAR:
            keywds["align_with_star"] = True
        else:
            keywds["align_with_bowtie2"] = True
        x = alignlib.make_rsem_command(
            ref.fasta_file_full, j.sample_h, j.pair1_h,
            fastq_file2=j.pair2_h, forward_prob=forward_prob,
            output_genome_bam=align_to_genome, bowtie_chunkmbs=chunkmbs,
            num_threads=nc, **keywds)
        x = "%s >& %s" % (x, sq(j.log_filename))
        commands.append(x)
    metadata["commands"] = commands
    metadata["num cores"] = num_cores

    # Need to run in out_path.  Otherwise, files will be everywhere.
    nc = num_cores
    if use_STAR:
        nc = 1
    parallel.pshell(commands, max_procs=nc, path=out_path)

    # Rename the hashed sample names back to the original unhashed
    # ones.
    files = os.listdir(out_path)
    rename_files = []   # list of (src, dst)
    for j in jobs:
        if j.sample == j.sample_h:
            continue
        for f in files:
            if not f.startswith(j.sample_h):
                continue
            src = os.path.join(out_path, f)
            x = j.sample + f[len(j.sample_h):]
            dst = os.path.join(out_path, x)
            rename_files.append((src, dst))
    for src, dst in rename_files:
        filelib.assert_exists(src)
        os.rename(src, dst)

    # Delete the symlinked fastq files.
    for j in jobs:
        filelib.safe_unlink(j.pair1_h)
        if j.pair2_h:
            filelib.safe_unlink(j.pair2_h)

    # Make sure the analysis completed successfully.
    x1 = [x.results_filename for x in jobs]
    x2 = [x.log_filename for x in jobs]
    filelib.assert_exists_nz_many(x1 + x2)

    return metadata
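
# A small sketch (hypothetical names) of the hash-and-rename scheme
# used above: RSEM outputs are written under a shell-safe hashed
# sample name, then renamed back once RSEM finishes.  The exact output
# of hashlib.hash_var is an assumption here.
#
#   sample   = "My Sample 1"
#   sample_h = hashlib.hash_var(sample)   # e.g. "My_Sample_1"
#   "My_Sample_1.genes.results" is renamed to "My Sample 1.genes.results"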