def DumpGOFromDatabase(outfile, dbhandle, options):
    """read GO assignments from the database and dump them into a flat
    file (a one-to-many mapping of genes to GO categories).
    """

    E.info("category\ttotal\tgenes\tcategories")

    all_genes = collections.defaultdict(int)
    all_categories = collections.defaultdict(int)
    all_ntotal = 0

    outfile.write("go_type\tgene_id\tgo_id\tdescription\tevidence\n")

    for go_type in options.ontology:

        genes = collections.defaultdict(int)
        categories = collections.defaultdict(int)
        ntotal = 0
        statement = GetGOStatement(go_type, options.database_name,
                                   options.species)
        results = Database.executewait(
            dbhandle, statement, retries=0).fetchall()

        for result in results:
            outfile.write("\t".join(map(str, (go_type,) + result)) + "\n")
            gene_id, goid, description, evidence = result
            genes[gene_id] += 1
            categories[goid] += 1
            ntotal += 1
            all_genes[gene_id] += 1
            all_categories[goid] += 1
            all_ntotal += 1

        E.info("%s\t%i\t%i\t%i" %
               (go_type, ntotal, len(genes), len(categories)))

    E.info("%s\t%i\t%i\t%i" %
           ("all", all_ntotal, len(all_genes), len(all_categories)))

    return
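# Illustrative sketch (not part of the original source): each row returned by
# GetGOStatement is assumed to be a 4-tuple (gene_id, go_id, description,
# evidence); the flat file then gains one tab-separated line per row,
# prefixed with the ontology name. The values below are made up.
def _example_go_dump_line():
    go_type = "biol_process"
    result = ("ENSG00000139618", "GO:0006281", "DNA repair", "IEA")
    return "\t".join(map(str, (go_type,) + result))
    # -> "biol_process\tENSG00000139618\tGO:0006281\tDNA repair\tIEA"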
def splitFiles(infile, nchunks, out_dir):
    '''
    Give files names based on splitting into an arbitrary number of chunks
    '''

    df = pd.read_table(infile, sep="\t", header=0, index_col=0)

    total = len(df.index.tolist())

    # split into arbitrary number of chunks, or arbitrary chunk size?
    # small n bad for large input size, large n bad for small input size
    # set min/max chunk size, e.g. 100 genes minimum, 500 maximum?
    if total // nchunks < 100:
        step = 100
        E.warn("too few genes in each chunk, resetting to "
               "100 genes per chunk")
    elif total // nchunks > 500:
        step = 500
        E.warn("too many genes per chunk, resetting to "
               "500 genes per chunk")
    else:
        step = total // nchunks

    E.info("chunking input file into chunks of %i genes" % step)

    # strip the trailing "-expression.tsv" suffix, if present
    file_pattern = infile.split("/")[1]
    suffix = "-expression.tsv"
    if file_pattern.endswith(suffix):
        file_pattern = file_pattern[:-len(suffix)]

    idx = 0
    for i in range(step, total, step):
        start = "%s" % idx
        end = "%s" % i
        file_name = "%s/%s-%s_%s-split.tsv" % (out_dir, file_pattern,
                                               start, end)
        with open(file_name, "w") as file_handle:
            file_handle.write(file_name + "\n")
        idx = i

    # final file
    start = "%s" % idx
    end = "%s" % total
    file_name = "%s/%s-%s_%s-split.tsv" % (out_dir, file_pattern,
                                           start, end)
    with open(file_name, "w") as file_handle:
        file_handle.write(file_name + "\n")
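# Worked example (illustrative only): with 1200 genes and nchunks=4 the raw
# chunk size is 300, which falls inside the 100-500 band, so the boundaries
# produced by the loop above are 0-300, 300-600, 600-900 and 900-1200.
def _example_chunk_boundaries(total=1200, nchunks=4):
    step = min(max(total // nchunks, 100), 500)
    bounds = []
    idx = 0
    for i in range(step, total, step):
        bounds.append((idx, i))
        idx = i
    bounds.append((idx, total))
    return bounds  # [(0, 300), (300, 600), (600, 900), (900, 1200)]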
def parseHeader(self, infile, outfile, options):
    """parse header in infile."""

    # skip comments until header
    while True:
        l = infile.readline()
        if not l:
            break
        if self.header_regex:
            if self.header_regex.search(l):
                break
        elif l[0] != "#":
            break
        options.stdlog.write(l)

    # print only the first header and check if
    # all the headers are the same.
    if self.header:
        if self.header != l:
            raise ValueError("inconsistent header in file %s\n"
                             "got=%s\nexpected=%s" %
                             (infile, l, self.header))
    else:
        outfile.write(l)
        self.header = l
        self.nfields = l.count("\t")
        if self.nfields == 0:
            E.warn("only single column in header: %s" % l[:-1])

    if self.mFieldIndex is None and self.mFieldName:
        try:
            self.mFieldIndex = self.header.split("\t").index(
                self.mFieldName)
        except ValueError:
            E.warn("no mapping, can not find field %s in %s" %
                   (self.mFieldName, self.header))
            self.mFieldName = None

        E.debug("substituting field: %s, %s" %
                (self.mFieldName, self.mFieldIndex))
def annotate(infile, annotation_file, outfile):
    '''
    annotate infile with annotations from annotation gtf file
    '''
    inf = open(infile)
    header = inf.readline()
    include = set()

    E.info("reading genes to keep")
    for line in inf.readlines():
        data = line[:-1].split("\t")
        gene_id = data[8].strip('"')
        include.add(gene_id)

    E.info("reading annotations file")
    annotations = {}
    for gtf in GTF.iterator(IOTools.openFile(annotation_file)):
        if gtf.gene_id in include:
            annotations[gtf.gene_id] = \
                [gtf.gene_name, gtf.species, gtf.description]

    inf = open(infile)
    header = inf.readline()

    E.info("writing results with annotations")
    outf = open(outfile, "w")
    outf.write(header.strip("\n") +
               "\tgene_name\tspecies_centroid\tdescription\n")
    for line in inf.readlines():
        data = line[:-1].split("\t")
        gene_id = data[8].strip('"')
        try:
            outf.write("\t".join(data + annotations[gene_id]) + "\n")
        except KeyError:
            outf.write("\t".join(data + ["NA", "NA", "NA"]) + "\n")

    outf.close()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-t", "--test", dest="test", type="string", help="supply help") parser.add_option("--plot-type", dest="plot_type", type="choice", choices=["manhattan", "qqplot", "epistasis"], help="plot type to generate") parser.add_option("--resolution", dest="resolution", type="choice", choices=["genome_wide", "chromosome", "fine_map"], help="the resolution of plotting, wether the plot " "depicts the whole genome, a single chromosome or " "a specific locus") parser.add_option("--file-format", dest="file_format", type="choice", choices=["plink", "cassi", "cassi_covar"], help="input file format, used to parse the file " "properly") parser.add_option("--save-path", dest="save_path", type="string", help="path and filename to save image to") # add common options (-h/--help, ...) and parse command line (options, args) = E.start(parser, argv=argv) parser.set_defaults(resolution="genome_wide", plot_type="manhattan", file_format="plink") # if the input is a list of files, split them infile = argv[-1] infiles = infile.split(",") # need to parse epistasis output slightly differently if options.plot_type == "epistasis": epi = True else: epi = False if len(infiles) > 1: results = gwas.GWASResults(assoc_file=infiles, epistasis=epi, file_format=options.file_format) elif len(infiles) == 1: results = gwas.GWASResults(assoc_file=infile, epistasis=epi, file_format=options.file_format) else: raise IOError("no input files detected, please specifiy association " "results files as the last command line argument") if options.plot_type == "manhattan": df = results.plotManhattan(resolution=options.resolution, save_path=options.save_path) elif options.plot_type == "qqplot": results.plotQQ(save_path=options.save_path, resolution=options.resolution) elif options.plot_type == "epistasis": results.plotEpistasis(save_path=options.save_path, resolution=options.resolution) else: pass # only output appended results for Manhattan plot, not qqplot try: df.to_csv(options.stdout, sep="\t", index=None) except UnboundLocalError: pass # write footer and output benchmark information. E.stop()
def mergeVariants(variants): '''merge overlapping variants. Overlapping variants occur if there are two deletions at the same location: WT ACTG Allele1 -CT- Allele2 ---- This will be encoded by samtools as (0-based coordinates):: 0 * -A/ACTG 3 * -G/-G This upsets the re-constitution algoritm. This method separates these two variants into two non-overlapping variants making use of variable length deletions. 0 * -A/-A 1 * ---G/-CTG Another case: WT ACTG Allele1 ACT- Allele2 ---- This will be encoded by samtools as (0-based coordinates):: 0 * */-ACTG 3 * -G/* This method separates these two as:: 0 * */-ACT 3 * -G/-G ''' if len(variants) == 0: return [] # sorts by start and then end variants.sort() merged_variants = [] def _add(offset, dest, src): for x, c in enumerate(src): dest[x + offset] = c def _split(seq0, seq1): # split was_0, was_1 = seq0[0] == "-", seq1[0] == "-" for x, cc in enumerate(zip(seq0, seq1)): is_0, is_1 = cc[0] == "-", cc[1] == "-" # yield all changes if (is_0 ^ was_0) or (is_1 ^ was_1): yield x, was_0, was_1 was_0, was_1 = is_0, is_1 yield x + 1, was_0, was_1 last = variants[0] for this in variants[1:]: if this.start < last.end and \ this.action == "-" and \ last.action == "-": E.warn("merging overlapping deletions: %s and %s" % (str(last), str(this))) mend = max(last.end, this.end) mstart = min(this.start, last.start) l = mend - mstart seq0 = list("-" * l) seq1 = list("-" * l) _add(last.start - mstart, seq0, last.variantseqs[0]) _add(last.start - mstart, seq1, last.variantseqs[1]) _add(this.start - mstart, seq0, this.variantseqs[0]) _add(this.start - mstart, seq1, this.variantseqs[1]) last_x = 0 n = [] for x, was_0, was_1 in _split(seq0, seq1): if last_x == x: continue this = ExtendedVariant._make(( mstart + last_x, mstart + x, "*", last.action, was_0 ^ was_1, ["".join(seq0[last_x:x]), "".join(seq1[last_x:x])], )) n.append(this) last_x = x E.warn("overlapping deletions merged in %i blocks as: %s" % (len(n), list(map(str, n)))) merged_variants.extend(n[:-1]) this = n[-1] else: merged_variants.append(last) last = this merged_variants.append(last) return merged_variants
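# Illustrative sketch (not from the original source): the core idea behind
# _split above is to walk two gap-padded allele strings in parallel and emit
# a breakpoint whenever either allele switches between deleted ('-') and
# retained state, so that overlapping deletions can be cut into
# non-overlapping blocks. The input strings below are invented for the
# example and do not correspond to a particular samtools record.
def _example_split_blocks(seq0, seq1):
    blocks, last = [], 0
    was = (seq0[0] == "-", seq1[0] == "-")
    for x, (a, b) in enumerate(zip(seq0, seq1)):
        now = (a == "-", b == "-")
        if now != was and x != last:
            blocks.append((last, x, seq0[last:x], seq1[last:x]))
            last, was = x, now
    blocks.append((last, len(seq0), seq0[last:], seq1[last:]))
    return blocks

# _example_split_blocks("A---", "-CTG")
#     -> [(0, 1, 'A', '-'), (1, 4, '---', 'CTG')]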
def main(argv=None): parser = getOptionParser() (options, args) = E.Start(parser, add_cluster_options=True) if len(args) == 0: raise ValueError( "command line argument missing - see usage information") options.renumber_column = [x.split(":") for x in options.renumber_column] cmd = args[0] if len(args) > 1: cmd += " '" + "' '".join(args[1:]) + "'" if options.dry_run: cmd = re.sub("%DIR%", "", cmd) retcode = subprocess.call(cmd, shell=True, stdin=sys.stdin, stdout=sys.stdout, cwd=os.getcwd(), close_fds=True) E.Stop() sys.exit(0) failed_requests = [] started_requests = [] niterations = 0 if not options.collect: tmpdir = os.path.abspath(tempfile.mkdtemp(dir=options.tmpdir)) E.info(" working in directory %s" % tmpdir) if options.split_at_lines: chunk_iterator = chunk_iterator_lines args = (options.split_at_lines, ) elif options.split_at_column: chunk_iterator = chunk_iterator_column args = (options.split_at_column - 1, options.max_files) elif options.split_at_regex: chunk_iterator = chunk_iterator_regex_split args = (re.compile(options.split_at_regex), 0, options.chunksize, options.max_lines) elif options.group_by_regex: chunk_iterator = chunk_iterator_regex_group args = (re.compile(options.group_by_regex), 0, options.chunksize) else: raise ValueError("please specify a way to chunk input data") data = [(x, cmd, options, None, options.subdirs) for x in chunk_iterator(options.stdin, args, prefix=tmpdir, use_header=options.input_header)] started_requests = [(x[0], x[0] + ".out") for x in data] if len(data) == 0: E.warn("no data received") E.Stop() sys.exit(0) if options.method == "multiprocessing": pool = Pool(options.cluster_num_jobs) results = pool.map(runCommand, data, chunksize=1) elif options.method == "drmaa": results = [] runDRMAA(data, environment=options.environment) elif options.method == "threads": pool = ThreadPool(options.cluster_num_jobs) results = pool.map(runCommand, data, chunksize=1) niterations = 0 for retcode, filename, cmd, logfile, iterations in results: niterations += iterations if not hasFinished(retcode, filename, options.output_tag, logfile): failed_requests.append((filename, cmd)) else: tmpdir = options.collect started_requests = [(x[:-4], x) for x in glob.glob(tmpdir + "/*.out")] E.info("collecting %i files from %s" % (len(started_requests), tmpdir)) if failed_requests: for fn, cmd in failed_requests: E.error("failed request: filename= %s, cmd= %s" % (fn, cmd)) else: E.info("building result from %i parts" % len(started_requests)) if options.renumber: mapper = MapperLocal(pattern=options.renumber) else: mapper = MapperEmpty() # deal with stdout name = None index = None for pattern, column in options.renumber_column: if re.search(pattern, "stdout"): try: index = int(column) - 1 except ValueError: name = column break if options.binary: ResultBuilderBinary()(started_requests, options.stdout, options) else: regex = None if options.output_regex_header: regex = re.compile(options.output_regex_header) ResultBuilder(mapper=mapper, field_index=index, field_name=name, header_regex=regex)(started_requests, options.stdout, options) # deal with logfiles : combine them into a single file rr = re.search("'--log=(\S+)'", cmd) or re.search("'--L\s+(\S+)'", cmd) if rr: E.info("logging output goes to %s" % rr.groups()[0]) logfile = IOTools.openFile(rr.groups()[0], "a") ResultBuilderLog()([(x[0], "%s.log" % x[0]) for x in started_requests], logfile, options) logfile.close() # deal with other files if options.subdirs: files = glob.glob("%s/*.dir/*" % tmpdir) # remove directory filenames = 
set([os.path.basename(x) for x in files]) xx = len(".out") for filename in filenames: _, filetype = os.path.splitext(filename) name = None index = None for pattern, column in options.renumber_column: if re.search(pattern, filename): try: index = int(column) - 1 except ValueError: name = column break if options.binary: builder = ResultBuilderBinary(mapper=mapper) elif filetype in (".fa", ".fasta"): builder = ResultBuilderFasta(mapper=mapper) elif filetype in (".mali", ): builder = ResultBuilderFasta(mapper=MapperEmpty()) elif filetype in (".psl"): builder = ResultBuilderPSL(mapper=mapper) elif filetype in (".gtf", ".gff"): builder = ResultBuilderGFF(mapper=mapper, field_index=index, field_name=name) elif filetype in (".png"): builder = ResultBuilderCopies(mapper=mapper) else: builder = ResultBuilder(mapper=mapper, field_index=index, field_name=name) E.debug("chose the following builder for %s: %s: %s" % (filename, filetype, str(builder))) E.info("collecting results for %s" % filename) input_filenames = [] for fi, fn in started_requests: fn = fn[:-xx] + ".dir/" + filename if os.path.exists(fn): input_filenames.append((fi, fn)) E.info("output of %i files goes to %s" % (len(filenames), filename)) outfile = IOTools.openFile(options.output_pattern % filename, "w") builder(input_filenames, outfile, options) outfile.close() if not options.debug and (not options.resume or not options.collect): if len(failed_requests) == 0: E.info("removing directory %s" % tmpdir) shutil.rmtree(tmpdir) else: E.info("directory %s not removed due to %i failed jobs" % (tmpdir, len(failed_requests))) E.info("job control: nstarted=%i, nfinished=%i, nerrors=%i, nrepeats=%i" % (len(started_requests), len(started_requests) - len(failed_requests), len(failed_requests), niterations)) E.Stop()
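# Hedged sketch (assumption, not the original implementation): the chunk
# iterators referenced above (e.g. chunk_iterator_lines) are expected to
# write pieces of stdin to files below ``prefix`` and yield the chunk
# filenames. A minimal line-based version, with invented filenames, might
# look like this.
import itertools
import os


def example_chunk_iterator_lines(infile, args, prefix, use_header=False):
    chunk_size = args[0]
    header = infile.readline() if use_header else None
    for nchunk in itertools.count():
        lines = list(itertools.islice(infile, chunk_size))
        if not lines:
            break
        filename = os.path.join(prefix, "chunk_%06i.in" % nchunk)
        with open(filename, "w") as outf:
            if header:
                outf.write(header)
            outf.writelines(lines)
        yield filename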
def runDRMAA(data, environment): '''run jobs in data using drmaa to connect to the cluster.''' # SNS: Error dection now taken care of with Cluster.py # expandStatement function # working directory - needs to be the one from which the # the script is called to resolve input files. cwd = os.getcwd() session = drmaa.Session() session.initialize() jobids = [] kwargs = {} for filename, cmd, options, tmpdir, subdirs in data: from_stdin, to_stdout = True, True if subdirs: outdir = "%s.dir/" % (filename) os.mkdir(outdir) cmd = re.sub("%DIR%", outdir, cmd) x = re.search("'--log=(\S+)'", cmd) or re.search("'--L\s+(\S+)'", cmd) if x: logfile = filename + ".log" cmd = cmd[:x.start()] + "--log=%s" % logfile + cmd[x.end():] else: logfile = filename + ".out" if "%STDIN%" in cmd: cmd = re.sub("%STDIN%", filename, cmd) from_stdin = False if "%STDOUT%" in cmd: cmd = re.sub("%STDOUT%", filename + ".out", cmd) to_stdout = False cmd = " ".join(re.sub("\t+", " ", cmd).split("\n")) E.info("running statement:\n%s" % cmd) job_script = tempfile.NamedTemporaryFile(dir=os.getcwd(), delete=False, mode="w+t") job_script.write("#!/bin/bash\n") # -l -O expand_aliases\n" ) job_script.write(Cluster.expandStatement(cmd) + "\n") job_script.close() job_path = os.path.abspath(job_script.name) os.chmod(job_path, stat.S_IRWXG | stat.S_IRWXU) # get session for process - only one is permitted job_name = os.path.basename(kwargs.get("outfile", "farm.py")) options_dict = vars(options) options_dict["workingdir"] = os.getcwd() if options.job_memory: job_memory = options.job_memory elif options.cluster_memory_default: job_memory = options.cluster_memory_default else: job_memory = "2G" jt = Cluster.setupDrmaaJobTemplate(session, options_dict, job_name, job_memory) jt.remoteCommand = job_path # update the environment e = {'BASH_ENV': options.bashrc} if environment: for en in environment: try: e[en] = os.environ[en] except KeyError: raise KeyError( "could not export environment variable '%s'" % en) jt.jobEnvironment = e # SNS: Native specifation setting abstracted # to Pipeline/Cluster.setupDrmaaJobTemplate() # use stdin for data if from_stdin: jt.inputPath = ":" + filename # set paths. # later: allow redirection of stdout and stderr to files # could this even be across hosts? if to_stdout: jt.outputPath = ":" + filename + ".out" else: jt.outputPath = ":" + filename + ".stdout" jt.errorPath = ":" + filename + ".err" jobid = session.runJob(jt) jobids.append((jobid, job_path, filename, cmd, logfile)) E.debug("%i jobs have been submitted" % len(jobids)) results = [] for jobid, job_path, filename, cmd, logfile in jobids: try: retval = session.wait(jobid, drmaa.Session.TIMEOUT_WAIT_FOREVER) except Exception as msg: # ignore message 24 in PBS # code 24: drmaa: Job finished but resource usage information # and/or termination status could not be provided.": if not msg.message.startswith("code 24"): raise retval = None if retval and retval.exitStatus != 0: raise OSError("Child was terminated by signal %i: \n%s\n" % (retval.exitStatus, cmd)) results.append((retval, filename, cmd, logfile, 1)) os.unlink(job_path) session.deleteJobTemplate(jt) session.exit()
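# Worked example (illustrative only): the rewrite in runDRMAA replaces a
# quoted --log option inside the command with a per-chunk log file. The
# command and chunk name below are invented.
import re


def _example_rewrite_log(cmd, filename):
    x = re.search(r"'--log=(\S+)'", cmd)
    if x:
        logfile = filename + ".log"
        return cmd[:x.start()] + "--log=%s" % logfile + cmd[x.end():]
    return cmd

# _example_rewrite_log("myscript '--log=run.log' -v", "chunk_000001")
#     -> "myscript --log=chunk_000001.log -v"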
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-m", "--method", dest="methods", type="choice", action="append", choices=( "geneprofile", "tssprofile", "utrprofile", "intervalprofile", "midpointprofile", "geneprofilewithintrons", "geneprofileabsolutedistancefromthreeprimeend", "separateexonprofile", "separateexonprofilewithintrons", ), help='counters to use. Counters describe the ' 'meta-gene structure to use. ' 'Note using geneprofilewithintrons, or ' 'geneprofileabsolutedistancefromthreeprimeend will ' 'automatically turn on the --use-base-accuracy option' '[%default].') parser.add_option("-b", "--bam-file", "--bedfile", "--bigwigfile", dest="infiles", metavar="BAM", type="string", action="append", help="BAM/bed/bigwig files to use. Do not mix " "different types [%default]") parser.add_option("-c", "--control-bam-file", dest="controlfiles", metavar="BAM", type="string", action="append", help="control/input to use. Should be of the same " "type as the bam/bed/bigwig file" " [%default]") parser.add_option("-g", "--gtf-file", dest="gtffile", type="string", metavar="GTF", help="GTF file to use. " "[%default]") parser.add_option("--normalize-transcript", dest="transcript_normalization", type="choice", choices=("none", "max", "sum", "total-max", "total-sum"), help="normalization to apply on each transcript " "profile before adding to meta-gene profile. " "[%default]") parser.add_option("--normalize-profile", dest="profile_normalizations", type="choice", action="append", choices=("all", "none", "area", "counts", "background"), help="normalization to apply on meta-gene " "profile normalization. " "[%default]") parser.add_option( "-r", "--reporter", dest="reporter", type="choice", choices=("gene", "transcript"), help="report results for genes or transcripts." " When 'genes` is chosen, exons across all transcripts for" " a gene are merged. When 'transcript' is chosen, counts are" " computed for each transcript separately with each transcript" " contributing equally to the meta-gene profile." " [%default]") parser.add_option("-i", "--shift-size", dest="shifts", type="int", action="append", help="shift reads in :term:`bam` formatted file " "before computing densities (ChIP-Seq). " "[%default]") parser.add_option("-a", "--merge-pairs", dest="merge_pairs", action="store_true", help="merge pairs in :term:`bam` formatted " "file before computing " "densities (ChIP-Seq). " "[%default]") parser.add_option("-u", "--use-base-accuracy", dest="base_accuracy", action="store_true", help="compute densities with base accuracy. The default " "is to only use the start and end of the aligned region " "(RNA-Seq) " "[%default]") parser.add_option("-e", "--extend", dest="extends", type="int", action="append", help="extend reads in :term:`bam` formatted file " "(ChIP-Seq). 
" "[%default]") parser.add_option("--resolution-upstream", dest="resolution_upstream", type="int", help="resolution of upstream region in bp " "[%default]") parser.add_option("--resolution-downstream", dest="resolution_downstream", type="int", help="resolution of downstream region in bp " "[%default]") parser.add_option("--resolution-upstream-utr", dest="resolution_upstream_utr", type="int", help="resolution of upstream UTR region in bp " "[%default]") parser.add_option("--resolution-downstream-utr", dest="resolution_downstream_utr", type="int", help="resolution of downstream UTR region in bp " "[%default]") parser.add_option("--resolution-cds", dest="resolution_cds", type="int", help="resolution of cds region in bp " "[%default]") parser.add_option("--resolution-first-exon", dest="resolution_first", type="int", help="resolution of first exon in gene, in bp" "[%default]") parser.add_option("--resolution-last-exon", dest="resolution_last", type="int", help="resolution of last exon in gene, in bp" "[%default]") parser.add_option("--resolution-introns", dest="resolution_introns", type="int", help="resolution of introns region in bp " "[%default]") parser.add_option("--resolution-exons-absolute-distance-topolya", dest="resolution_exons_absolute_distance_topolya", type="int", help="resolution of exons absolute distance " "topolya in bp " "[%default]") parser.add_option("--resolution-introns-absolute-distance-topolya", dest="resolution_introns_absolute_distance_topolya", type="int", help="resolution of introns absolute distance " "topolya in bp " "[%default]") parser.add_option("--extension-exons-absolute-distance-topolya", dest="extension_exons_absolute_distance_topolya", type="int", help="extension for exons from the absolute " "distance from the topolya in bp " "[%default]") parser.add_option( "--extension-introns-absolute-distance-topolya", dest="extension_introns_absolute_distance_topolya", type="int", help="extension for introns from the absolute distance from " "the topolya in bp [%default]") parser.add_option("--extension-upstream", dest="extension_upstream", type="int", help="extension upstream from the first exon in bp" "[%default]") parser.add_option("--extension-downstream", dest="extension_downstream", type="int", help="extension downstream from the last exon in bp" "[%default]") parser.add_option("--extension-inward", dest="extension_inward", type="int", help="extension inward from a TSS start site in bp" "[%default]") parser.add_option("--extension-outward", dest="extension_outward", type="int", help="extension outward from a TSS start site in bp" "[%default]") parser.add_option("--scale-flank-length", dest="scale_flanks", type="int", help="scale flanks to (integer multiples of) gene length" "[%default]") parser.add_option( "--control-factor", dest="control_factor", type="float", help="factor for normalizing control and foreground data. " "Computed from data if not set. " "[%default]") parser.add_option("--output-all-profiles", dest="output_all_profiles", action="store_true", help="keep individual profiles for each " "transcript and output. " "[%default]") parser.add_option("--counts-tsv-file", dest="input_filename_counts", type="string", help="filename with count data for each transcript. " "Use this instead " "of recomputing the profile. 
Useful for plotting the " "meta-gene profile " "from previously computed counts " "[%default]") parser.add_option( "--background-region-bins", dest="background_region_bins", type="int", help="number of bins on either end of the profile " "to be considered for background meta-gene normalization " "[%default]") parser.set_defaults( remove_rna=False, ignore_pairs=False, force_output=False, bin_size=10, extends=[], shifts=[], sort=[], reporter="transcript", resolution_cds=1000, resolution_introns=1000, # 3kb is a good balance of seeing long enough 3 prime bias and not omit # too many genes. Tim 31th Aug 2013 resolution_exons_absolute_distance_topolya=3000, # introns is only for assess the noise level, thus do ont need a long # region, a long region has the side effect of omit more genes. Tim # 31th Aug 2013 resolution_introns_absolute_distance_topolya=500, # extension can simply just be the same as resolution extension_exons_absolute_distance_topolya=3000, extension_introns_absolute_distance_topolya=500, resolution_upstream_utr=1000, resolution_downstream_utr=1000, resolution_upstream=1000, resolution_downstream=1000, resolution_first=1000, resolution_last=1000, # mean length of transcripts: about 2.5 kb extension_upstream=2500, extension_downstream=2500, extension_inward=3000, extension_outward=3000, plot=True, methods=[], infiles=[], controlfiles=[], gtffile=None, profile_normalizations=[], transcript_normalization=None, scale_flanks=0, merge_pairs=False, min_insert_size=0, max_insert_size=1000, base_accuracy=False, matrix_format="single", control_factor=None, output_all_profiles=False, background_region_bins=10, input_filename_counts=None, ) # add common options (-h/--help, ...) and parse command line (options, args) = E.start(parser, argv=argv, add_output_options=True) # Keep for backwards compatability if len(args) == 2: infile, gtf = args options.infiles.append(infile) options.gtffile = gtf if not options.gtffile: raise ValueError("no GTF file specified") if options.gtffile == "-": options.gtffile = options.stdin else: options.gtffile = IOTools.open_file(options.gtffile) if len(options.infiles) == 0: raise ValueError("no bam/wig/bed files specified") for methodsRequiresBaseAccuracy in [ "geneprofilewithintrons", "geneprofileabsolutedistancefromthreeprimeend", ]: # If you implemented any methods that you do not want the # spliced out introns or exons appear to be covered by # non-existent reads, it is better you let those methods imply # --base-accurarcy by add them here. 
if methodsRequiresBaseAccuracy in options.methods: options.base_accuracy = True if options.reporter == "gene": gtf_iterator = GTF.flat_gene_iterator(GTF.iterator(options.gtffile)) elif options.reporter == "transcript": gtf_iterator = GTF.transcript_iterator(GTF.iterator(options.gtffile)) # Select rangecounter based on file type if len(options.infiles) > 0: if options.infiles[0].endswith(".bam"): bamfiles = [pysam.AlignmentFile(x, "rb") for x in options.infiles] if options.controlfiles: controlfiles = [ pysam.AlignmentFile(x, "rb") for x in options.controlfiles ] else: controlfiles = None format = "bam" if options.merge_pairs: range_counter = _bam2geneprofile.RangeCounterBAM( bamfiles, shifts=options.shifts, extends=options.extends, merge_pairs=options.merge_pairs, min_insert_size=options.min_insert_size, max_insert_size=options.max_insert_size, controfiles=controlfiles, control_factor=options.control_factor) elif options.shifts or options.extends: range_counter = _bam2geneprofile.RangeCounterBAM( bamfiles, shifts=options.shifts, extends=options.extends, controlfiles=controlfiles, control_factor=options.control_factor) elif options.base_accuracy: range_counter = _bam2geneprofile.RangeCounterBAMBaseAccuracy( bamfiles, controlfiles=controlfiles, control_factor=options.control_factor) else: range_counter = _bam2geneprofile.RangeCounterBAM( bamfiles, controlfiles=controlfiles, control_factor=options.control_factor) elif options.infiles[0].endswith(".bed.gz"): bedfiles = [pysam.Tabixfile(x) for x in options.infiles] if options.controlfiles: controlfiles = [ pysam.Tabixfile(x) for x in options.controlfiles ] else: controlfiles = None range_counter = _bam2geneprofile.RangeCounterBed( bedfiles, controlfiles=controlfiles, control_factor=options.control_factor) elif options.infiles[0].endswith(".bw"): wigfiles = [BigWigFile(file=open(x)) for x in options.infiles] range_counter = _bam2geneprofile.RangeCounterBigWig(wigfiles) else: raise NotImplementedError("can't determine file type for %s" % str(options.infiles)) counters = [] for method in options.methods: if method == "utrprofile": counters.append( _bam2geneprofile.UTRCounter( range_counter, options.resolution_upstream, options.resolution_upstream_utr, options.resolution_cds, options.resolution_downstream_utr, options.resolution_downstream, options.extension_upstream, options.extension_downstream, )) elif method == "geneprofile": counters.append( _bam2geneprofile.GeneCounter( range_counter, options.resolution_upstream, options.resolution_cds, options.resolution_downstream, options.extension_upstream, options.extension_downstream, options.scale_flanks)) elif method == "geneprofilewithintrons": counters.append( _bam2geneprofile.GeneCounterWithIntrons( range_counter, options.resolution_upstream, options.resolution_cds, options.resolution_introns, options.resolution_downstream, options.extension_upstream, options.extension_downstream, options.scale_flanks)) elif method == "geneprofileabsolutedistancefromthreeprimeend": # options.extension_exons_absolute_distance_tostartsite, # options.extension_introns_absolute_distance_tostartsite, # Tim 31th Aug 2013: a possible feature for future, if five prime # bias is of your interest. # (you need to create another class). 
It is not very difficult to # derive from this class, but is not implemented yet # This future feature is slightly different the TSS profile # already implemented, because in this future feature introns are # skipped, counters.append( _bam2geneprofile.GeneCounterAbsoluteDistanceFromThreePrimeEnd( range_counter, options.resolution_upstream, options.resolution_downstream, options.resolution_exons_absolute_distance_topolya, options.resolution_introns_absolute_distance_topolya, options.extension_upstream, options.extension_downstream, options.extension_exons_absolute_distance_topolya, options.extension_introns_absolute_distance_topolya, options.scale_flanks)) elif method == "tssprofile": counters.append( _bam2geneprofile.TSSCounter(range_counter, options.extension_outward, options.extension_inward)) elif method == "intervalprofile": counters.append( _bam2geneprofile.RegionCounter(range_counter, options.resolution_upstream, options.resolution_cds, options.resolution_downstream, options.extension_upstream, options.extension_downstream)) elif method == "midpointprofile": counters.append( _bam2geneprofile.MidpointCounter(range_counter, options.resolution_upstream, options.resolution_downstream, options.extension_upstream, options.extension_downstream)) # add new method to split 1st and last exons out # requires a representative transcript for reach gene # gtf should be sorted gene-position elif method == "separateexonprofile": counters.append( _bam2geneprofile.SeparateExonCounter( range_counter, options.resolution_upstream, options.resolution_first, options.resolution_last, options.resolution_cds, options.resolution_downstream, options.extension_upstream, options.extension_downstream)) elif method == "separateexonprofilewithintrons": counters.append( _bam2geneprofile.SeparateExonWithIntronCounter( range_counter, options.resolution_upstream, options.resolution_first, options.resolution_last, options.resolution_cds, options.resolution_introns, options.resolution_downstream, options.extension_upstream, options.extension_downstream)) # set normalization for c in counters: c.setNormalization(options.transcript_normalization) if options.output_all_profiles: c.setOutputProfiles( IOTools.open_file( E.getOutputFile(c.name) + ".profiles.tsv.gz", "w")) if options.input_filename_counts: # read counts from file E.info("reading counts from %s" % options.input_filename_counts) all_counts = pandas.read_csv(IOTools.open_file( options.input_filename_counts), sep='\t', header=0, index_col=0) if len(counters) != 1: raise NotImplementedError( 'counting from matrix only implemented for 1 counter.') # build counter based on reference counter counter = _bam2geneprofile.UnsegmentedCounter(counters[0]) counters = [counter] _bam2geneprofile.countFromCounts(counters, all_counts) else: E.info("starting counting with %i counters" % len(counters)) feature_names = _bam2geneprofile.countFromGTF(counters, gtf_iterator) # output matrices if not options.profile_normalizations: options.profile_normalizations.append("none") elif "all" in options.profile_normalizations: options.profile_normalizations = [ "none", "area", "counts", "background" ] for method, counter in zip(options.methods, counters): profiles = [] for norm in options.profile_normalizations: # build matrix, apply normalization profile = counter.getProfile( normalize=norm, background_region_bins=options.background_region_bins) profiles.append(profile) for x in range(1, len(profiles)): assert profiles[0].shape == profiles[x].shape # build a single matrix of all profiles for 
output matrix = numpy.concatenate(profiles) matrix.shape = len(profiles), len(profiles[0]) matrix = matrix.transpose() with IOTools.open_file( E.getOutputFile(counter.name) + ".matrix.tsv.gz", "w") as outfile: outfile.write("bin\tregion\tregion_bin\t%s\n" % "\t".join(options.profile_normalizations)) fields = [] bins = [] for field, nbins in zip(counter.fields, counter.nbins): fields.extend([field] * nbins) bins.extend(list(range(nbins))) for row, cols in enumerate(zip(fields, bins, matrix)): outfile.write("%i\t%s\t" % (row, "\t".join([str(x) for x in cols[:-1]]))) outfile.write("%s\n" % ("\t".join([str(x) for x in cols[-1]]))) with IOTools.open_file( E.getOutputFile(counter.name) + ".lengths.tsv.gz", "w") as outfile: counter.writeLengthStats(outfile) if options.output_all_profiles: counter.closeOutputProfiles() if options.plot: import matplotlib # avoid Tk or any X matplotlib.use("Agg") import matplotlib.pyplot as plt for method, counter in zip(options.methods, counters): if method in ("geneprofile", "geneprofilewithintrons", "geneprofileabsolutedistancefromthreeprimeend", "utrprofile", "intervalprofile", "separateexonprofile", "separateexonprofilewithintrons"): plt.figure() plt.subplots_adjust(wspace=0.05) max_scale = max([max(x) for x in counter.aggregate_counts]) for x, counts in enumerate(counter.aggregate_counts): plt.subplot(6, 1, x + 1) plt.plot(list(range(len(counts))), counts) plt.title(counter.fields[x]) plt.ylim(0, max_scale) figname = counter.name + ".full" fn = E.getOutputFile(figname) + ".png" plt.savefig(os.path.expanduser(fn)) plt.figure() points = [] cuts = [] for x, counts in enumerate(counter.aggregate_counts): points.extend(counts) cuts.append(len(counts)) plt.plot(list(range(len(points))), points) xx, xxx = 0, [] for x in cuts: xxx.append(xx + x // 2) xx += x plt.axvline(xx, color="r", ls="--") plt.xticks(xxx, counter.fields) figname = counter.name + ".detail" fn = E.getOutputFile(figname) + ".png" plt.savefig(os.path.expanduser(fn)) elif method == "tssprofile": plt.figure() plt.subplot(1, 3, 1) plt.plot( list( range(-options.extension_outward, options.extension_inward)), counter.aggregate_counts[0]) plt.title(counter.fields[0]) plt.subplot(1, 3, 2) plt.plot( list( range(-options.extension_inward, options.extension_outward)), counter.aggregate_counts[1]) plt.title(counter.fields[1]) plt.subplot(1, 3, 3) plt.title("combined") plt.plot( list( range(-options.extension_outward, options.extension_inward)), counter.aggregate_counts[0]) plt.plot( list( range(-options.extension_inward, options.extension_outward)), counter.aggregate_counts[1]) plt.legend(counter.fields[:2]) fn = E.getOutputFile(counter.name) + ".png" plt.savefig(os.path.expanduser(fn)) elif method == "midpointprofile": plt.figure() plt.plot(numpy.arange(-options.resolution_upstream, 0), counter.aggregate_counts[0]) plt.plot(numpy.arange(0, options.resolution_downstream), counter.aggregate_counts[1]) fn = E.getOutputFile(counter.name) + ".png" plt.savefig(os.path.expanduser(fn)) # write footer and output benchmark information. E.stop()
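# Worked example (illustrative only): the output matrix above is built by
# concatenating one 1-D profile per normalization and reshaping so that rows
# become bins and columns become normalizations. The toy profiles below are
# invented.
import numpy


def _example_profile_matrix():
    profiles = [numpy.array([1.0, 2.0, 3.0]),    # e.g. "none"
                numpy.array([0.1, 0.2, 0.3])]    # e.g. "area"
    matrix = numpy.concatenate(profiles)
    matrix.shape = len(profiles), len(profiles[0])
    matrix = matrix.transpose()
    return matrix  # 3 rows (bins) x 2 columns (normalizations)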
def _write_tabbed(self, name, lines, E):
    outfile = E.openOutputFile(name)
    outfile.write('\n'.join(lines))
    outfile.write('\n')
    outfile.close()
def concatenate_tables(outfile, options, args): '''concatenate tables.''' missing_value = options.missing_value rx = re.compile(options.regex_filename) if options.headers is None or options.headers == "auto": row_headers = [[y for y in rx.search(x).groups()] for x in options.filenames] else: row_headers = [options.headers] tables, headers = [], [] # read all tables for filename, header in zip(options.filenames, row_headers): table = read_table(filename, options) if len(table) == 0: E.warn("table '%s' is empty" % filename) continue tables.append(table) headers.append(header) row_headers = headers if options.cat is None: if len(row_headers) == 1: row_head_titles = ["filename"] else: row_head_titles = [ "pattern" + str(x) for x in range(len(row_headers)) ] else: row_head_titles = [x.strip() for x in options.cat.split(",")] if len(row_headers[0]) != len(row_head_titles): raise ValueError( "row header (%i) has different number of fields in " "regular expression than supplied by the --cat option (%i)" % (len(row_headers[0]), len(row_head_titles))) # collect titles if options.input_has_titles: titles = collections.OrderedDict() for table in tables: for key in table[0][:-1].split("\t"): # skip any titles that conflict with # the newly added titles if key in row_head_titles: continue titles[key] = 1 outfile.write("%s\t%s\n" % ("\t".join( [x for x in row_head_titles]), "\t".join(list(titles.keys())))) map_title2column = collections.defaultdict(lambda: None) for x, title in enumerate(titles.keys()): map_title2column[title] = x else: ncolumns = [len(table[0].split('\t')) for table in tables] if min(ncolumns) != max(ncolumns): raise ValueError('tables have unequal number of columns ' '(min=%i, max=%i)' % (min(ncolumns), max(ncolumns))) # create a pseudo dictionary of columns titles = collections.OrderedDict([(x, x) for x in range(min(ncolumns))]) all_titles = set(titles.keys()) for nindex, table in enumerate(tables): if options.input_has_titles: titles = table[0][:-1].split("\t") map_old2new = [map_title2column[t] for t in titles] del table[0] else: map_old2new = list(range(len(all_titles))) for l in table: data = [missing_value] * len(all_titles) for x, value in enumerate(l[:-1].split("\t")): if map_old2new[x] is None: continue data[map_old2new[x]] = value row = "\t".join([str(x) for x in row_headers[nindex]] + data) + "\n" outfile.write(row)
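# Worked example (illustrative only): when concatenating tables with titles,
# each file's columns are mapped onto the union of titles collected across
# all tables, and missing cells are filled with the missing value. The names
# below are invented.
def _example_concat_row():
    titles = {"gene": 0, "count": 1, "tpm": 2}      # union of all titles
    file_titles = ["gene", "tpm"]                   # titles of one file
    map_old2new = [titles[t] for t in file_titles]  # [0, 2]
    data = ["na"] * len(titles)
    for x, value in enumerate("BRCA2\t12.5".split("\t")):
        data[map_old2new[x]] = value
    return data  # ['BRCA2', 'na', '12.5']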
def main(argv=sys.argv): parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-t", "--no-titles", dest="input_has_titles", action="store_false", help="no titles in input [%default].") parser.add_option("--ignore-titles", dest="ignore_titles", action="store_true", help="ignore titles in input [%default]") parser.add_option("-i", "--skip-titles", dest="skip_titles", action="store_true", help="skip output of titles.") parser.add_option("-m", "--missing-value", dest="missing_value", type="string", help="entry to use for missing values.") parser.add_option("--header-names", dest="headers", type="string", help="add headers for files as a ,-separated " "list [%default].") parser.add_option("-c", "--columns", dest="columns", type="string", help="columns to use for joining. Multiple columns " "can be specified as a comma-separated list " "[default=%default].") parser.add_option("-k", "--take", dest="take", type="string", action="append", help="columns to take. If not set, all columns " "except for " "the join columns are taken [%default]") parser.add_option("-g", "--glob", dest="glob", type="string", help="wildcard expression for table names.") parser.add_option("-s", "--sort-order", dest="sort", type="string", help="sort by column titles in particular given order: " "alphabetical|numeric|list of columns.") parser.add_option("-e", "--merge-overlapping", dest="merge", action="store_true", help="simply merge tables without matching up " "rows. [default=%default].") parser.add_option("-a", "--cat", dest="cat", type="string", help="simply concatenate tables. Adds an " "additional column called X with the filename " " [default=%default].") parser.add_option("--sort-keys", dest="sort_keys", type="choice", choices=("numeric", "alphabetic"), help="sort key columns by value.") parser.add_option("--keep-empty", dest="ignore_empty", action="store_false", help="keep empty tables. The default is " "to ignore them.") parser.add_option("--ignore-empty", dest="ignore_empty", action="store_true", help="ignore empty tables - this is " "the default [%default].") parser.add_option("--add-file-prefix", dest="add_file_prefix", action="store_true", help="add file prefix to " "columns headers. Suitable for multi-column" "tables [default=%default]") parser.add_option("--use-file-prefix", dest="use_file_prefix", action="store_true", help="use file prefix as column headers. " "Suitable for two-column tables " "[default=%default]") parser.add_option("--prefixes", dest="prefixes", type="string", help="list of prefixes to use. " ", separated list of prefixes. 
" "The number of prefixes need to correspond to the " "number of input files [default=%default]") parser.add_option("--regex-filename", dest="regex_filename", type="string", help="pattern to apply to filename to " "build prefix [default=%default]") parser.add_option("--regex-start", dest="regex_start", type="string", help="regular expression to start " "collecting table in a file [default=%default]") parser.add_option("--regex-end", dest="regex_end", type="string", help="regular expression to end collecting " "table in a file [default=%default]") parser.add_option("--test", dest="test", type="int", help="test combining tables with " "first X rows [default=%default]") parser.set_defaults( input_has_titles=True, skip_titles=False, missing_value="na", headers=None, sort=None, glob=None, columns="1", sort_keys=False, merge=False, ignore_empty=True, regex_start=None, regex_end=None, add_file_prefix=False, use_file_prefix=False, cat=None, take=[], regex_filename="(.*)", prefixes=None, test=0, ) (options, args) = E.start(parser, argv=argv) if options.headers: if "," in options.headers: options.headers = options.headers.split(",") else: options.headers = re.split("\s+", options.headers.strip()) if options.sort and options.sort not in ("numeric", "alphabetic"): if "," in options.sort: options.sort = options.sort.split(",") else: options.sort = re.split("\s+", options.sort) if options.merge: options.columns = [] else: options.columns = [int(x) - 1 for x in options.columns.split(",")] options.filenames = [] if options.glob: options.filenames += glob.glob(options.glob) options.filenames += args if len(options.filenames) < 1: raise ValueError("no tables found.") E.info("combining %i tables" % len(options.filenames)) if options.cat: concatenate_tables(options.stdout, options, args) else: join_tables(options.stdout, options, args) E.stop()
def join_tables(outfile, options, args): '''join tables.''' if options.headers and options.headers[0] != "auto" and \ len(options.headers) != len(options.filenames): raise ValueError("number of provided headers (%i) " "is not equal to number filenames (%i)." % (len(options.headers), len(options.filenames))) tables = [] keys = {} sorted_keys = [] sizes = {} if options.merge: titles = ["count"] else: titles = [] headers_to_delete = [] if options.prefixes: prefixes = [x.strip() for x in options.prefixes.split(",")] if len(prefixes) != len(options.filenames): raise ValueError( ("number of prefixes (%i) and tables (%i) " "do not match") % (len(prefixes), len(options.filenames))) else: prefixes = None E.debug("joining on columns %s and taking columns %s" % (options.columns, options.take)) for nindex, filename in enumerate(options.filenames): E.info("processing %s (%i/%i)" % (filename, nindex + 1, len(options.filenames))) prefix = os.path.basename(filename) lines = read_table(filename, options) # skip (or not skip) empty tables if len(lines) == 0 and options.ignore_empty: E.warn("%s is empty - skipped" % filename) headers_to_delete.append(nindex) continue table = {} sizes = {} max_size = 0 ncolumns = 0 if options.input_has_titles: data = lines[0][:-1].split("\t") # no titles have been defined so far if not titles: key = "-".join([data[x] for x in options.columns]) titles = [key] # set take based on column titles or numerically if options.take: take = [] # convert numeric columns for filtering for x in options.take: try: take.append(int(x) - 1) except ValueError: # will raise error if x is not present take.append(data.index(x)) else: # tables with max 100 columns take = None for x in range(len(data)): if x in options.columns or (take and x not in take): continue ncolumns += 1 if options.add_file_prefix: try: p = re.search(options.regex_filename, prefix).groups()[0] except AttributeError: E.warn("can't extract title from filename %s" % prefix) p = "unknown" titles.append("%s_%s" % (p, data[x])) elif options.use_file_prefix: try: p = re.search(options.regex_filename, prefix).groups()[0] except: E.warn("can't extract title from filename %s" % prefix) p = "unknown" titles.append("%s" % p) elif prefixes: titles.append("%s_%s" % (prefixes[nindex], data[x])) else: titles.append(data[x]) del lines[0] else: # set take based on numeric columns if no titles are present if options.take: take = [] # convert numeric columns for filtering for x in options.take: take.append(int(x) - 1) else: # tables with max 100 columns take = None # IMS: We might still want filename titles even if the input # columns don't have titles. 
if options.add_file_prefix: if not titles: titles = ["ID"] try: p = re.search(options.regex_filename, prefix).groups()[0] except AttributeError: E.warn("can't extract title from filename %s" % prefix) p = "unknown" titles.append("%s_%s" % (p, data[x])) elif options.use_file_prefix: if not titles: titles = ["ID"] try: p = re.search(options.regex_filename, prefix).groups()[0] except: E.warn("can't extract title from filename %s" % prefix) p = "unknown" titles.append("%s" % p) ncolumns = 1 n = 0 for line in lines: data = line[:-1].split("\t") try: row_keys = [data[x] for x in options.columns] except IndexError as msg: raise IndexError("error while parsing %s: %s" % (filename, msg)) if options.sort_keys: if options.sort_keys == "numeric": row_keys.sort(lambda x, y: cmp(float(x), float(y))) else: row_keys.sort() if options.merge: key = n else: key = "-".join(row_keys) if key not in keys: sorted_keys.append(key) keys[key] = 1 sizes[key] = 0 if take: max_size = len(take) table[key] = [data[x] for x in take] else: max_size = max(len(data) - len(options.columns), max_size) table[key] = [ data[x] for x in range(0, len(data)) if x not in options.columns ] n += 1 # enter columns of "na" for empty tables. if max_size == 0: max_size = ncolumns tables.append((max_size, table)) # delete in reverse order if options.headers: for nindex in headers_to_delete[::-1]: del options.headers[nindex] if len(tables) == len(titles) - 1: if options.headers: headers = ["bin"] if options.headers[0] == 'auto': for t in range(len(tables)): headers.append(os.path.basename(options.filenames[t])) headers += [""] * (tables[t][0] - 1) else: for t in range(len(tables)): headers.append(options.headers[t]) headers += [""] * (tables[t][0] - 1) # use headers as titles, if headers is given and skip-titles is # turned on if options.input_has_titles and options.skip_titles: titles = headers else: # otherwise: print the headers out right away outfile.write("\t".join(headers) + "\n") order = list(range(0, len(tables) + 1)) if options.input_has_titles or \ (options.use_file_prefix or options.add_file_prefix): if options.sort: sort_order = [] if options.sort == "numeric": t = list( zip(list(map(int, titles[1:])), list(range(1, len(titles) + 1)))) t.sort() for tt in t: sort_order.append(titles[tt[1]]) elif options.sort == "alphabetical": t = list(zip(titles[1:], list(range(1, len(titles) + 1)))) t.sort() for tt in t: sort_order.append(titles[tt[1]]) else: sort_order = options.sort map_title2pos = {} for x in range(1, len(titles)): map_title2pos[titles[x]] = x order = [ 0, ] for x in sort_order: if x in map_title2pos: order.append(map_title2pos[x]) else: order = list(range(0, len(titles))) outfile.write("\t".join( [titles[order[x]] for x in range(len(titles))])) outfile.write("\n") if options.sort_keys: if options.sort_keys: if options.sort_keys == "numeric": sorted_keys.sort(lambda x, y: cmp(float(x), float(y))) else: sorted_keys.sort() for key in sorted_keys: outfile.write("%s" % key) for x in order[1:]: max_size, table = tables[x - 1] c = 0 if key in table: outfile.write("\t") outfile.write("\t".join(table[key])) c = len(table[key]) assert (max_size == 1) outfile.write("\t%s" % options.missing_value * (max_size - c)) outfile.write("\n") else: # for multi-column table, just write if options.input_has_titles: outfile.write("\t".join([titles[x] for x in range(len(titles))])) outfile.write("\n") for key in sorted_keys: outfile.write("%s" % key) for x in range(len(tables)): max_size, table = tables[x] c = 0 if key in table: outfile.write("\t") 
outfile.write("\t".join(table[key])) c = len(table[key]) outfile.write("\t%s" % options.missing_value * (max_size - c)) outfile.write("\n")
def main(argv=sys.argv): parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option( "-i", "--input-bam", dest="input_bam_file", type="string", help="input bam file") parser.add_option( "-f", "--reference-bam", dest="reference_bam_file", type="string", help="reference BAM file [%default]") parser.add_option( "-q", "--query-name-regex", dest="query_name_regex", type="string", help="regular expression to apply on query name. " "Potentially required to match samtools sort order and should " "evaluate to an integer [%default]") parser.set_defaults( input_bam_file=None, reference_bam_file=None, query_name_regex=None, ) (options, args) = E.start(parser, argv, add_output_options=True) if len(args) == 2: options.input_bam_file = args[0] options.reference_bam_file = args[1] if options.input_bam_file is None: raise ValueError("please supply a BAM file as input") if options.reference_bam_file is None: raise ValueError("please supply a BAM file as reference") # update paths to absolute options.input_bam_file = os.path.abspath(options.input_bam_file) options.reference_bam_file = os.path.abspath(options.reference_bam_file) if not os.path.exists(options.input_bam_file): raise OSError("input bam file {} does not exist".format( options.input_bam_file)) if not os.path.exists(options.reference_bam_file): raise OSError("reference bam file {} does not exist".format( options.reference_bam_file)) bam_in = pysam.AlignmentFile(options.input_bam_file) ref_in = pysam.AlignmentFile(options.reference_bam_file) outf_mapped = E.open_output_file("mapped") outf_mapped.write("\t".join( ["read", "length", "status", "overlap", "comp_contig", "comp_start", "comp_end", "ref_contig", "ref_start", "ref_end", "shared_misaligned", "shared_aligned", "shared_insertion", "shared_deletion", "comp_aligned", "comp_insertion", "comp_deletion", "ref_aligned", "ref_insertion", "ref_deletion"]) + "\n") outf_missing = E.open_output_file("missing") outf_missing.write("\t".join( ["read", "length", "status", "aligned", "insertion", "deletion"]) + "\n") counter = E.Counter() if options.query_name_regex: rx = re.compile(options.query_name_regex) def extract_query(x): return int(rx.search(x).groups()[0]) qname_fn = None if options.query_name_regex: qname_fn = extract_query for reads_cmp, read_ref in group_pairs(iterate_read_pairs( bam_in.fetch(until_eof=True), ref_in.fetch(until_eof=True), qname_fn=qname_fn)): if len(reads_cmp) == 0: counter.missing += 1 pairs_ref = set(read_ref.get_aligned_pairs()) outf_missing.write("\t".join( map(str, ( read_ref.query_name, read_ref.query_length, "missing") + count_pairs(pairs_ref))) + "\n") continue if len(reads_cmp) > 1: # multiple matches counter.multi_mapping += 1 prefix = "multi_" else: counter.unique_mapping += 1 prefix = "unique_" is_mapped = False for read_cmp in reads_cmp: counter.paired += 1 if read_cmp.is_unmapped: counter.unmapped += 1 pairs_ref = set(read_ref.get_aligned_pairs()) outf_missing.write("\t".join( map(str, ( read_ref.query_name, read_ref.query_length, "unmapped") + count_pairs(pairs_ref))) + "\n") continue overlap = max(0, (min(read_cmp.reference_end, read_ref.reference_end) - max(read_cmp.reference_start, read_ref.reference_start))) pairs_cmp = set(read_cmp.get_aligned_pairs()) pairs_ref = set(read_ref.get_aligned_pairs()) shared_cmp = pairs_cmp.intersection(pairs_ref) unique_cmp = pairs_cmp.difference(pairs_ref) missaligned = len([x for x, y in unique_cmp if x is not None and y is not None]) if read_cmp.reference_name != 
read_ref.reference_name or \ overlap == 0: status = "mismapped" else: counter.overlap += 1 status = "mapped" is_mapped = True outf_mapped.write("\t".join( map(str, (read_cmp.query_name, read_cmp.query_length, prefix + status, overlap, read_cmp.reference_name, read_cmp.reference_start, read_cmp.reference_end, read_ref.reference_name, read_ref.reference_start, read_ref.reference_end, missaligned) + count_pairs(shared_cmp) + count_pairs(pairs_cmp) + count_pairs(pairs_ref))) + "\n") else: if is_mapped: status = "mapped" else: status = "mismapped" counter[prefix + status] += 1 with E.open_output_file("summary") as outf: outf.write("category\tcounts\n") outf.write(counter.asTable() + "\n") E.stop()
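# Illustrative sketch (not from the original source): the comparison above
# relies on pysam's get_aligned_pairs(), which returns (query_pos, ref_pos)
# tuples. Positions aligned identically in both reads are "shared", while
# pairs unique to the test read with both coordinates set count as
# misaligned bases. The coordinates below are invented.
def _example_pair_overlap(pairs_cmp, pairs_ref):
    pairs_cmp, pairs_ref = set(pairs_cmp), set(pairs_ref)
    shared = pairs_cmp.intersection(pairs_ref)
    unique_cmp = pairs_cmp.difference(pairs_ref)
    misaligned = len([x for x, y in unique_cmp
                      if x is not None and y is not None])
    return len(shared), misaligned

# _example_pair_overlap([(0, 100), (1, 101)], [(0, 100), (1, 102)]) -> (1, 1)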
def main(argv=sys.argv): parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-b", "--reference-bed-file", dest="reference_bed_file", type="string", help="reference bed file " "[%default]") parser.add_option("-m", "--method", dest="method", type="choice", choices=("lvc-comparison", ), help="methods to apply [%default]") parser.set_defaults(method="lvc-comparison", reference_fasta_file=None, input_bed_file=None, size_bins=(1000, 10000, 100000), output_sets=True, region_string=None) # add common options (-h/--help, ...) and parse command line (options, args) = E.start(parser, argv=argv, add_output_options=True) reference_set = collections.defaultdict(quicksect.IntervalTree) E.info("reading reference bed file from {}".format( options.reference_bed_file)) with IOTools.open_file(options.reference_bed_file) as inf: for record in pysam.tabix_iterator(inf, pysam.asBed()): mm = reference_set[record.contig] mm.add(record.start, record.end) E.info("read reference intervals on {} contigs: {}".format( len(list(reference_set.keys())), ",".join(list(reference_set.keys())))) if options.output_sets: output_tp = E.open_output_file("tp") output_fp = E.open_output_file("fp") output_fn = E.open_output_file("fn") else: output_tp = None output_fp = None output_fn = None if options.method == "lvc-comparison": c = E.Counter() found = set() counts = {} names = set() nsize_bins = len(options.size_bins) for bin in range(len(options.size_bins) + 1): counts[bin] = dict([(x, collections.defaultdict(int)) for x in ("tp", "fn", "fp", "test", "truth")]) for record in pysam.tabix_iterator(options.stdin, pysam.asBed()): if record.contig not in reference_set: c.ignored_no_contig += 1 continue c.test += 1 matches = reference_set[record.contig].search( record.start, record.end) size = record.end - record.start bin = get_size_bin(size, options.size_bins) if len(matches) == 0: c.fp += 1 status = "fp" if output_fp: output_fp.write(str(record) + "\n") elif len(matches) >= 1: c.tp += 1 status = "tp" if output_tp: output_tp.write(str(record) + "\n") # todo: overlap criteria # record found for match in matches: found.add((record.contig, match.start, match.end)) name = record.name.split(",")[0] names.add(name) counts[bin]["test"][name] += 1 counts[bin][status][name] += 1 outf = options.stdout with IOTools.open_file(options.reference_bed_file) as inf: for record in pysam.tabix_iterator(inf, pysam.asBed()): c.truth += 1 bin = get_size_bin(record.end - record.start, options.size_bins) counts[bin]["truth"]["all"] += 1 key = (record.contig, record.start, record.end) if key not in found: c.fn += 1 counts[bin]["fn"]["all"] += 1 outf.write("\t".join(("category", "size", "test", "tp", "fp", "truth", "fn")) + "\n") for name in sorted(names): for bin in range(len(options.size_bins) + 1): if bin == len(options.size_bins): size_bin = ">={}".format(options.size_bins[-1]) else: size_bin = "<{}".format(options.size_bins[bin]) outf.write("\t".join( map(str, ( name, size_bin, counts[bin]["test"][name], counts[bin]["tp"][name], counts[bin]["fp"][name], counts[bin]["truth"]["all"], counts[bin]["fn"]["all"], ))) + "\n") E.info(str(c)) E.stop()
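# Hedged sketch (assumption, not the original implementation): get_size_bin
# is used above but not shown here. Consistent with how counts are reported
# (bins labelled "<threshold", and the last bin ">=largest"), it is expected
# to return the index of the first size threshold the interval falls below,
# and len(size_bins) for anything at or above the largest threshold.
def example_get_size_bin(size, size_bins):
    for bin, threshold in enumerate(size_bins):
        if size < threshold:
            return bin
    return len(size_bins)

# example_get_size_bin(500, (1000, 10000, 100000))     -> 0
# example_get_size_bin(250000, (1000, 10000, 100000))  -> 3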
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-m", "--min-overlap", dest="min_overlap", type="float", help="minimum overlap [%default]") parser.add_option("-a", "--bam-file", dest="filename_bam", metavar="bam", type="string", help="bam-file to use (required) [%default]") parser.add_option("-b", "--bed-file", dest="filename_bed", metavar="bed", type="string", help="bed-file to use (required) [%default]") parser.add_option("-s", "--sort-bed", dest="sort_bed", action="store_true", help="sort the bed file by chromosomal location before " "processing. " "[%default]") parser.add_option( "--assume-sorted", dest="sort_bed", action="store_false", help="assume that the bed-file is sorted by chromosomal location. " "[%default]") parser.add_option( "--split-intervals", dest="split_intervals", action="store_true", help="treat split BAM intervals, for example spliced intervals, " "as separate intervals. Note that a single alignment might be " "counted several times as a result. " "[%default]") parser.set_defaults( min_overlap=0.5, filename_bam=None, filename_bed=None, sort_bed=True, split_intervals=False, ) # add common options (-h/--help, ...) and parse command line (options, args) = E.start(parser, argv=argv) filename_bam = options.filename_bam filename_bed = options.filename_bed if filename_bam is None and filename_bed is None: if len(args) != 2: raise ValueError( "please supply a bam and a bed file or two bed-files.") filename_bam, filename_bed = args if filename_bed is None: raise ValueError("please supply a bed file to compare to.") if filename_bam is None: raise ValueError("please supply a bam file to compare with.") E.info("intersecting the two files") min_overlap = options.min_overlap options.stdout.write("category\talignments\n") # get number of columns of reference bed file for bed in Bed.iterator(IOTools.open_file(filename_bed)): ncolumns_bed = bed.columns break E.info("assuming %s is bed%i format" % (filename_bed, ncolumns_bed)) if ncolumns_bed < 4: raise ValueError("please supply a name attribute in the bed file") # get information about if filename_bam.endswith(".bam"): format = "-abam" samfile = pysam.AlignmentFile(filename_bam, "rb") total = samfile.mapped # latest bedtools uses bed12 format when bam is input ncolumns_bam = 12 # count per read sort_key = lambda x: x.name else: format = "-a" total = IOTools.get_num_lines(filename_bam) # get bed format ncolumns_bam = 0 for bed in Bed.iterator(IOTools.open_file(filename_bam)): ncolumns_bam = bed.columns break if ncolumns_bam > 0: E.info("assuming %s is bed%i fomat" % (filename_bam, ncolumns_bam)) if ncolumns_bam == 3: # count per interval sort_key = lambda x: (x.contig, x.start, x.end) else: # count per interval category sort_key = lambda x: x.name # use fields for bam/bed file (regions to count with) data_fields = [ "contig", "start", "end", "name", "score", "strand", "thickstart", "thickend", "rgb", "blockcount", "blockstarts", "blockends" ][:ncolumns_bam] # add fields for second bed (regions to count in) data_fields.extend([ "contig2", "start2", "end2", "name2", "score2", "strand2", "thickstart2", "thickend2", "rgb2", "blockcount2", "blockstarts2", "blockends2" ][:ncolumns_bed]) # add bases overlap data_fields.append("bases_overlap") data = collections.namedtuple("data", data_fields) options.stdout.write("total\t%i\n" % 
total) if total == 0: E.warn("no data in %s" % filename_bam) return # SNS: sorting optional, off by default if options.sort_bed: bedcmd = "<( gunzip < %s | sort -k1,1 -k2,2n)" % filename_bed else: bedcmd = filename_bed if options.split_intervals: split = "-split" else: split = "" # IMS: newer versions of intersectBed have a very high memory # requirement unless passed sorted bed files. statement = """bedtools intersect %(format)s %(filename_bam)s -b %(bedcmd)s %(split)s -sorted -bed -wo -f %(min_overlap)f""" % locals() E.info("starting counting process: %s" % statement) proc = E.run(statement, return_popen=True, stdout=subprocess.PIPE) E.info("counting") counts_per_alignment = collections.defaultdict(int) take_columns = len(data._fields) def iterate(infile): for line in infile: if not line.strip(): continue yield data._make(line[:-1].split()[:take_columns]) for read, overlaps in itertools.groupby(iterate( IOTools.force_str(proc.stdout)), key=sort_key): annotations = [x.name2 for x in overlaps] for anno in annotations: counts_per_alignment[anno] += 1 for key, counts in sorted(counts_per_alignment.items()): options.stdout.write("%s\t%i\n" % (key, counts)) # write footer and output benchmark information. E.stop()
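# A minimal, self-contained sketch of the counting pattern used above:
# bedtools intersect emits one line per (alignment, interval) overlap;
# consecutive lines sharing the same sort key are grouped and every
# overlapping annotation increments a counter. The data below are
# illustrative only.
import collections
import itertools

# each tuple stands for one intersect output line: (read name, annotation)
rows = [("read1", "exon"), ("read1", "utr"), ("read2", "exon")]

counts = collections.defaultdict(int)
for read, overlaps in itertools.groupby(rows, key=lambda x: x[0]):
    for _, anno in overlaps:
        counts[anno] += 1

print(dict(counts))  # {'exon': 2, 'utr': 1}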
def main(argv=sys.argv):
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-a", "--as-gtf", dest="as_gtf", action="store_true",
                      help="output as gtf.")

    parser.add_option(
        "-f", "--id-format", dest="id_format", type="string",
        help="format for numeric identifier if --as-gtf is set and "
        "no name in bed file [%default].")

    parser.set_defaults(as_gtf=False,
                        id_format="%08i",
                        test=None)

    (options, args) = E.start(parser, add_pipe_options=True)

    as_gtf = options.as_gtf
    id_format = options.id_format

    # a single GTF.Entry is reused for both gff and gtf output;
    # gene_id and transcript_id are only filled in when --as-gtf is set
    gff = GTF.Entry()
    gff.source = "bed"
    gff.feature = "exon"

    ninput, noutput, nskipped = 0, 0, 0

    id = 0
    for bed in Bed.iterator(options.stdin):
        ninput += 1

        gff.contig = bed.contig
        gff.start = bed.start
        gff.end = bed.end

        # optional bed fields after contig/start/end are name, score, strand
        if bed.fields and len(bed.fields) >= 3:
            gff.strand = bed.fields[2]
        else:
            gff.strand = "."

        if bed.fields and len(bed.fields) >= 2:
            gff.score = bed.fields[1]

        if as_gtf:
            if bed.fields:
                gff.gene_id = bed.fields[0]
                gff.transcript_id = bed.fields[0]
            else:
                id += 1
                gff.gene_id = id_format % id
                gff.transcript_id = id_format % id
        else:
            if bed.fields:
                gff.source = bed.fields[0]

        options.stdout.write(str(gff) + "\n")

        noutput += 1

    E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped))

    E.stop()
def main(argv=None):

    if not argv:
        argv = sys.argv

    # get the options
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-c", "--chain-file", dest="chainfile", type="string",
                      help="the chain file to analyse", metavar="FILE")

    parser.add_option(
        "--alignments-per-contig", dest="nperchrom", type="int",
        help="Number of alignments to report on per chromosome pair",
        default=2)

    parser.add_option(
        "--aggregate-by", dest="aggregate", type="choice",
        choices=("contig", "none"),
        help="Set to `contig` to perform per chromosome pair analysis",
        default="none")

    parser.add_option(
        "-i", "--output-identity", dest="output_identity",
        action="store_true",
        help="Generate stats on the sequence identity of the gapped "
        "chains. Requires FastaIndex.py",
        default=False)

    parser.add_option("-d", "--dbpath", dest="dbpath", type="string",
                      help="The path to the indexed fasta files",
                      default=".")

    parser.add_option("-t", "--target-genome", dest="targetgenome",
                      type="string",
                      help="The target genome, eg. Mm19",
                      default=None)

    parser.add_option("-q", "--query-genome", dest="querygenome",
                      type="string",
                      help="The query genome eg. Hg17",
                      default=None)

    parser.add_option(
        "-e", "--errors", dest="errors", action="store_true",
        help="Check chains for erroneous contig sizes using the given db",
        default=False)

    parser.add_option("-r", "--output-report", dest="output_report",
                      action="store_true",
                      help="Write out tab-delimited reports for each analysis",
                      default=False)

    (options, args) = E.start(parser, argv=argv, add_output_options=True)

    # make a list of counting objects
    counters = []
    counters.append(CounterPerChromosome(gapped=True))
    counters.append(CounterPerChromosome(gapped=False))

    if options.aggregate == "contig":
        counters.append(CounterPerChromosomePair(gapped=True))
        counters.append(CounterPerChromosomePair(gapped=False))

    counters.append(CounterOfGappedChainLengths(gapped=True))
    counters.append(CounterOfGappedChainLengths(gapped=False))

    if options.output_identity is True:
        if not options.targetgenome or not options.querygenome:
            raise Exception(
                "Target and query genomes must be specified with the "
                "\"-t\" and \"-q\" options")
        t_db_path = os.path.join(options.dbpath, options.targetgenome)
        q_db_path = os.path.join(options.dbpath, options.querygenome)
        counters.append(CounterPercentIdentify(t_db_path, q_db_path))

    if options.errors is True:
        if not options.targetgenome or not options.querygenome:
            raise Exception(
                "Target and query genomes must be specified with the "
                "\"-t\" and \"-q\" options")
        counters.append(CounterOfErrors(options))

    # iterate over the chains and counters
    for chain in chain_iterator(options.stdin):
        c = Chain(chain)
        for counter in counters:
            counter.add(c)

    # write a report to stdout and individual reports to tab delimited files
    options.stdout.write(
        "\n\n********** chain2stats report starts **********\n")
    for counter in counters:
        counter.report(options)
        if options.output_report is True:
            counter.tabbed_report(options, E)
    options.stdout.write("\n********** chain2stats report ends **********\n\n")

    E.stop()
def _iterate(self):
    """iterate over multiple files."""

    def _iter(infile):

        identifier = None
        is_new = False

        for line in infile:
            if line.startswith("#"):
                continue
            if line.startswith(">"):
                if self.regexIdentifier:
                    try:
                        identifier = re.search(self.regexIdentifier,
                                               line[1:-1]).groups()[0]
                    except AttributeError:
                        raise ValueError(
                            "could not parse identifier from line %s "
                            "- check the input" % line[1:-1])
                else:
                    identifier = re.split(r"\s", line[1:-1])[0]
                is_new = True
            else:
                if not identifier:
                    raise ValueError(
                        "refusing to emit sequence without identifier "
                        "- check the input")
                yield is_new, identifier, line.strip()
                is_new = False

    for filename in self.filenames:

        if self.format == "tar.gz" or self.format == "tar" or \
                (self.format == "auto" and filename.endswith("tar.gz")):
            if filename == "-":
                tf = tarfile.open(fileobj=sys.stdin.buffer, mode="r|*")
            else:
                tf = tarfile.open(filename, mode="r")

            for f in tf:
                b, ext = os.path.splitext(f.name)
                if ext.lower() in (".fasta", ".fa"):
                    E.info("extracting %s" % f.name)
                    if sys.version_info.major >= 3:
                        infile = io.TextIOWrapper(tf.extractfile(f),
                                                  encoding="ascii")
                    else:
                        infile = tf.extractfile(f)
                    for x in _iter(infile):
                        yield x
                else:
                    E.info("skipping %s" % f.name)

            # closing the TarFile does not close an underlying fileobj
            # such as sys.stdin.buffer
            tf.close()
            continue

        elif self.format == "fasta.gz" or \
                (self.format == "auto" and filename.endswith(".gz")):
            infile = IOTools.open_file(filename, "r")
        elif filename == "-":
            infile = sys.stdin
        else:
            infile = IOTools.open_file(filename, "r")

        for x in _iter(infile):
            yield x

        if filename != "-":
            infile.close()

    # do not raise StopIteration here: since PEP 479 this would be turned
    # into a RuntimeError; simply returning ends the generator.
    return
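# Sketch of consuming the (is_new, identifier, sequence-chunk) triples
# yielded above to assemble complete records. The helper name is
# illustrative and not part of the module.
def assemble(triples):
    name, chunks = None, []
    for is_new, identifier, seq in triples:
        if is_new and chunks:
            # a new header was seen: emit the previous record
            yield name, "".join(chunks)
            chunks = []
        name = identifier
        chunks.append(seq)
    if chunks:
        yield name, "".join(chunks)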
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser( version="%prog version: $Id: set_diff.py 2782 2009-09-10 11:40:29Z andreas $") parser.add_option("-p", "--add-percent", dest="add_percent", action="store_true", help="add percentage information to each line.") parser.add_option("-t", "--header-names", dest="headers", type="string", help="comma separated list of headers. If empty or set to '-', filenames are used.") parser.add_option("--skip-header", dest="add_header", action="store_false", help="do not add header to flat format.") parser.add_option("--output-with-header", dest="write_header", action="store_true", help="write header and exit.") parser.add_option("--with-title", dest="with_title", action="store_true", help="use column titles in input data [%default].") parser.add_option("--no-title", dest="with_title", action="store_false", help="there are no titles in input data [%default].") parser.set_defaults( add_percent=False, percent_format="%5.2f", headers=None, add_header=True, write_header=False, with_title=True, ) (options, args) = E.start(parser) if options.add_header: options.stdout.write( "set1\tset2\tn1\tn2\tunion\tinter\tunique1\tunique2") if options.add_percent: options.stdout.write( "\tpinter\tpunique1\tpunique2\tpcov1\tpcov2\tpcovmax") options.stdout.write("\n") if options.write_header: sys.exit(0) if len(args) < 2: raise ValueError("please supply at least two filenames.") headers, titles, sets = [], [], [] if options.headers: if options.headers == "-": headers = args else: headers = options.headers.split(",") if len(headers) != len(args): raise ValueError( "please supply the same number of headers as there are filenames.") for f in args: if options.with_title: title, data = IOTools.readList( IOTools.open_file(f, "r"), with_title=options.with_title) titles.append(title) else: data = IOTools.readList(open(f, "r")) sets.append(set(data)) if not headers and titles: headers = titles else: headers = args for x in range(len(sets) - 1): set1 = sets[x] for y in range(x + 1, len(sets)): set2 = sets[y] l1, l2 = len(set1), len(set2) options.stdout.write("%s\t%s\t%i\t%i\t%i\t%i\t%i\t%i" % (headers[x], headers[y], l1, l2, len(set1.union( set2)), len(set1.intersection( set2)), len(set1.difference( set2)), len(set2.difference(set1)))) if options.add_percent: if len(set1) == 0: ri, r1, r2 = 0, 1, 0 c1, c2, cm = 1, 0, 0 elif len(set2) == 0: ri, r1, r2 = 0, 0, 1 c1, c2, cm = 0, 1, 0 else: i = len(set1.intersection(set2)) ri, r1, r2 = ( i / float(len(set1.union(set2))), len(set1.difference(set2)) / float(l1), len(set2.difference(set1)) / float(l2)) c1, c2 = (i / float(l1), i / float(l2)) cm = max(c1, c2) options.stdout.write( "\t" + ("\t".join([options.percent_format for z in range(6)])) % (ri, r1, r2, c1, c2, cm)) options.stdout.write("\n") E.stop()
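# Worked example of the percentage columns computed above, assuming two
# small identifier sets (expected values in the trailing comments):
set1, set2 = {"a", "b", "c"}, {"b", "c", "d", "e"}
inter = len(set1 & set2)                              # 2
pinter = inter / len(set1 | set2)                     # 2/5 = 0.40
punique1 = len(set1 - set2) / len(set1)               # 1/3
punique2 = len(set2 - set1) / len(set2)               # 2/4 = 0.50
pcov1, pcov2 = inter / len(set1), inter / len(set2)   # 2/3, 2/4
pcovmax = max(pcov1, pcov2)                           # 2/3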
def runCommand(data):

    filename, cmd, options, tmpdir, subdirs = data

    if subdirs:
        outdir = "%s.dir/" % (filename)
        os.mkdir(outdir)
        cmd = re.sub("%DIR%", outdir, cmd)

    x = re.search(r"'--log=(\S+)'", cmd) or re.search(r"'--L\s+(\S+)'", cmd)
    if x:
        logfile = filename + ".log"
        cmd = cmd[:x.start()] + "--log=%s" % logfile + cmd[x.end():]
    else:
        logfile = filename + ".out"

    # working directory - needs to be the one from which the script is
    # called in order to resolve input files.
    cwd = os.getcwd()

    if "<(" in cmd or "|" in cmd:
        if "'" in cmd:
            raise ValueError(
                "advanced bash syntax `<()` combined with single quotes")
        cmd = """/bin/bash -c '%s'""" % cmd

    if "|" in cmd:
        if r"\|" not in cmd:
            E.warn("pipes (`|`) within command need to be escaped, "
                   "otherwise jobs run on submit host")

    c = '%s -v "BASH_ENV=%s" -q %s -p %i %s %s' % (
        options.cluster_cmd,
        options.bashrc,
        options.cluster_queue,
        options.cluster_priority,
        options.cluster_options,
        cmd)

    iteration = 0

    while 1:

        iteration += 1

        if iteration > 1:
            E.info("%s: re-submitting command (repeat=%i): %s" %
                   (filename, iteration, c))
        else:
            E.info("%s: submitting command: %s" % (filename, c))

        infile = IOTools.openFile(filename, "r")
        outfile = IOTools.openFile(filename + ".out", "w")
        errfile = IOTools.openFile(filename + ".err", "a")

        retcode = subprocess.call(c,
                                  shell=True,
                                  stdin=infile,
                                  stdout=outfile,
                                  stderr=errfile,
                                  cwd=cwd,
                                  close_fds=True)

        infile.close()
        outfile.close()
        errfile.close()

        if hasFinished(retcode, filename, options.output_tag, logfile):
            break

        if iteration > options.resubmit:
            E.warn("%s: giving up executing command: retcode=%i" %
                   (filename, retcode))
            break

        E.warn("%s: error while executing command: retcode=%i" %
               (filename, retcode))

    return (retcode, filename, cmd, logfile, iteration)
def build_report(): '''build report from scratch.''' E.info("starting documentation build process from scratch") P.run_report(clean=True)
def getOptionParser(): """create parser and add options.""" parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("--split-at-lines", dest="split_at_lines", type="int", help="split jobs according to line number [%default].") parser.add_option( "--split-at-column", dest="split_at_column", type="int", help="split jobs according to column. Columns start at number 1 " "and the input should be sorted by this column [%default].") parser.add_option( "--group-by-regex", dest="group_by_regex", type="string", help="group jobs according to a regular expression [%default].") parser.add_option( "--split-at-regex", dest="split_at_regex", type="string", help="split jobs according to a regular expression [%default].") parser.add_option("--split-at-tag", dest="split_at_tag", type="int", help="split a file at a tag [%default].") parser.add_option( "--chunk-size", dest="chunksize", type="int", help="when splitting at regex or tag, aggregate x entries [%default].") parser.add_option( "--debug", dest="debug", action="store_true", help="debug mode. Do not delete temporary file [%default].") parser.add_option( "--dry-run", dest="dry_run", action="store_true", help="dry run. Do not split input and simply forward stdin to stdout. " "Useful for debugging the command [%default].") parser.add_option("--input-header", dest="input_header", action="store_true", help="The input stream contains a table header. " "This header is replicated for each job [%default].") parser.add_option( "--output-header", dest="output_header", action="store_true", help="The output jobs contain a table header. " "The header is removed for each job except for the first [%default].") parser.add_option( "--output-regex-header", dest="output_regex_header", type="string", help="Regular expression for header (in stdout stream). Any lines " "before the first line matching this regular expression are ignored" "[%default].") parser.add_option( "--output-tag", dest="output_tag", type="string", help="The output jobs contain a tag in the last line denoting " "job completion. If the unix return value denotes an error, the " "presence of this tag is checked [%default].") parser.add_option( "--subdirs", dest="subdirs", action="store_true", help="Run within separate subdirs for jobs. This permits " "multiple output streams. Use a placeholder %DIR% if you supply " "the ouput pattern as a command line option [%default].") parser.add_option( "-T", "--temp-dir", dest="tmpdir", type="string", help="Temporary directory to be used. Default is the current " "directory [%default].") parser.add_option("--max-files", dest="max_files", type="int", help="create at most x files [%default].") parser.add_option( "--max-lines", dest="max_lines", type="int", help="in addition to splitting into chunksize, also split if " "more than max-lines is reached [%default].") parser.add_option( "--renumber", dest="renumber", type="string", help="renumber ids consecutively, supply a pattern [%default].") parser.add_option( "--renumber-column", dest="renumber_column", type="string", action="append", help="specify column to renumber. The format is regex:column, " "for example csv:1 or csv:id [%default].") parser.add_option( "-r", "--reduce", dest="reduce", type="string", action="append", help="Add reduce functions for specific files. The format is " "file:reducer. The default reducer is 'table' for all files " "[%default].") parser.add_option( "-m", "--map", dest="map", type="string", action="append", help="Map specific columns in tables. 
The format is " "file:column:pattern, for example .table:1:%06i [%default].") parser.add_option("--resume", dest="resume", type="string", help="resume aborted run from files in dir [%default]") parser.add_option("--collect", dest="collect", type="string", help="collect files in dir and process as normally " "[%default]") parser.add_option("--is-binary", dest="binary", action="store_true", help="the output is binary - files are concatenated " "without parsing [%default]") parser.add_option( "--resubmit", dest="resubmit", type="int", help="if a job fails, automatically resubmit # times. Set to 0 " "in order to disable resubmission [%default]") parser.add_option("--fail", dest="resubmit", action="store_false", help="if a job fails, do not resubmit [%default]") parser.add_option("--bashrc", dest="bashrc", type="string", help="bashrc file to use [%default]") parser.add_option("--method", dest="method", type="choice", choices=("multiprocessing", "threads", "drmaa"), help="method to submit jobs [%default]") parser.add_option("--job-memory", dest="job_memory", type="string", help="per-job memory requirement." "Unit must be specified, eg. 100M, 1G ") parser.add_option( "-e", "--env", dest="environment", type="string", action="append", help="environment variables to be passed to the jobs [%default]") parser.add_option( "--output-filename-pattern", dest="output_pattern", type="string", help="Pattern for secondary output filenames. Should contain a '%s' " "[%default].") parser.set_defaults( split_at_lines=None, split_at_column=None, split_at_regex=None, group_by_regex=None, split_at_tag=None, chunksize=100, cluster_cmd='qrsh -cwd -now n', bashrc="~/.bashrc", input_header=False, output_header=False, output_regex_header=None, debug=False, dry_run=False, tmpdir="./", subdirs=False, renumber=None, output_tag="# job finished", map=[], reduce=[], resume=None, renumber_column=[], resubmit=5, collect=None, method="drmaa", job_memory=None, max_files=None, max_lines=None, binary=False, environment=[], output_pattern="%s", ) # stop parsing options at the first argument parser.disable_interspersed_args() return parser
def update_report(): '''update report.''' E.info("updating documentation") P.run_report(clean=False)
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser( version="%prog version: $Id: r_mann_whitney_u.py 2782 2009-09-10 11:40:29Z andreas $") parser.add_option("-m", "--method", dest="method", type="string", help="method to use [ks=Kolmogorov-Smirnov,mwu=Mann-WhitneyU]") parser.add_option("-a", "--hardcopy", dest="hardcopy", type="string", help="write hardcopy to file.", metavar="FILE") parser.add_option("-1", "--infile1", dest="filename_input1", type="string", help="input filename for distribution 1.") parser.add_option("-2", "--infile2", dest="filename_input2", type="string", help="input filename for distribution 2.") parser.add_option("-p", "--infile-map", dest="filename_input_map", type="string", help="input filename for mapping categories to values.") parser.set_defaults( method="ks", filename_input1=None, filename_input2=None, filename_input_map=None, ) (options, args) = E.start(parser, add_pipe_options=True) map_category2value = {} if options.filename_input_map: map_category2value = IOTools.ReadMap(open(options.filename_input_map, "r"), map_functions=(str, float)) values1, errors1 = IOTools.ReadList(open(options.filename_input1, "r"), map_category=map_category2value) values2, errors2 = IOTools.ReadList(open(options.filename_input2, "r"), map_category=map_category2value) E.info("ninput1=%i, nerrors1=%i, ninput2=%i, nerrors2=%i" % (len(values1), len(errors1), len(values2), len(errors2))) if options.hardcopy: R.png(options.hardcopy, width=1024, height=768) if options.method == "ks": result = R.ks_test(values1, values2) elif options.method == "mwu": result = R.wilcox_test(values1, values2, paired=False) R.assign("v1", values1) R.assign("v2", values2) R.layout(R.matrix((1, 2, 3, 4), 2, 2, byrow=True)) R.boxplot(values1, values2, col=('white', 'red'), main="Boxplot") R("""qqplot( v1, v2, main ='Quantile-quantile plot' ); lines( c(0,1), c(0,1) );""") R("""hist( v1, freq=FALSE, width=0.5, density=10, main='Relative frequency histogram')""") R("""hist( v2, freq=FALSE, add=TRUE, width=0.5, col='red', offset=0.5, density=20, angle=135)""") R("""hist( v1, freq=TRUE, width=0.5, density=10, main='Absolute frequency histogram')""") R("""hist( v2, freq=TRUE, add=TRUE, width=0.5, col='red', offset=0.5, density=20, angle=135)""") print("## Results for %s" % result['method']) for x in ['p.value', 'statistic', 'alternative', 'method']: print(x, result[x]) E.stop()
def main(argv=None): parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option( "-f", "--fasta", dest="input_filename_fasta", type="string", help="filename with fasta sequences. ") parser.add_option( "-o", "--output-filename-sequences", dest="output_filename_sequences", type="string", help="output per sequence information to filename") parser.set_defaults( input_filename_fasta=None, ) (options, args) = E.start(parser, argv=argv) if len(args) > 0: options.input_filename_fasta = args[0] sequence_pairs = [] if options.input_filename_fasta != "-" and os.path.exists( options.input_filename_fasta + ".fai"): has_index = 1 fastafile = pysam.FastaFile(options.input_filename_fasta) sequence_pairs = list(zip(fastafile.references, fastafile.lengths)) else: has_index = 0 iterator = pysam.FastxFile(options.input_filename_fasta) for record in iterator: sequence_pairs.append( (record.name, len(record.sequence))) lengths = numpy.array([x[1] for x in sequence_pairs]) options.stdout.write("\t".join(( "has_index", "nsequences", "total_length", "min_length", "max_length", "median_length", "mean_length")) + "\n") if len(lengths) > 0: options.stdout.write("\t".join(map(str, ( has_index, len(sequence_pairs), lengths.sum(), lengths.min(), lengths.max(), numpy.median(lengths), lengths.mean()))) + "\n") else: options.stdout.write("\t".join(map(str, ( has_index, len(sequence_pairs), 0, "", "", "", ""))) + "\n") if options.output_filename_sequences: with IOTools.open_file(options.output_filename_sequences, "w") as outf: outf.write("name\tlength\n") outf.write( "\n".join(["\t".join(map(str, x)) for x in sequence_pairs]) + "\n") E.stop()
def buildAlleles(sequence, variants, reference_start=0, phased=True): '''build alleles for ``sequence`` adding ``variants``. Variants are assumed to be in 0-based coordinates on the same strand as the sequence. ``reference_start`` is the position of the first base of ``sequence``. Set to 0, if the positions in ``variants`` are relative to ``sequence``. ''' def _delete(allele, del_start, del_end, variant, sequence, startoffset, endoffset, feature_start, feature_end): '''little helper: update ``allele`` with a deletion ``del_start:del_end``. ''' # truncate variant according to the feature variant = variant[startoffset:len(variant) - endoffset] n = variant.count("-") if n: if variant.startswith("-"): del_start += n variant = variant[n:] else: del_end -= n variant = variant[:-n] # due to gaps, the variant is not actually within the feauture if del_start >= del_end: return refseq = sequence[del_start:del_end].upper() assert refseq == variant, \ 'reference base mismatch at deletion: expected %s %s %s, got %s[%i:%i] at feature=%i-%i, variant=%i-%i, relative=%i-%i, del=%i-%i, action=%s' % \ (sequence[del_start - 10:del_start], refseq, sequence[del_end:del_end + 10], variant, startoffset, len(variant) - endoffset, feature_start, feature_end, var_start, var_end, rel_start, rel_end, del_start, del_end, action) l = del_end - del_start # assert len("".join(allele[del_start:del_end])) == l, \ # "deletion conflicts with other indels: " \ # "got %s[%i:%i] (ref=%s, allele=%s) at feature=%i-%i, variant=%i-%i, relative=%i-%i, del=%i-%i, action=%s" % \ # (variant, startoffset, len(variant)-endoffset, # refseq, str(allele[del_start:del_end]), # feature_start, feature_end, # var_start, var_end, # rel_start, rel_end, # del_start, del_end, # action) allele[del_start:del_end] = [""] * l allele1 = list(sequence.lower()) allele2 = list(sequence.lower()) if reference_start is None: feature_start = 0 else: feature_start = reference_start feature_end = feature_start + len(sequence) # main loop: insert variants into allele sequences for var_start, var_end, reference, action, has_wildtype, variantseqs in variants: # skip variants that are out-of-range if var_end <= feature_start or var_start >= feature_end: continue is_homozygous = len(variantseqs) == 1 and not has_wildtype rel_start, rel_end = var_start - feature_start, var_end - feature_start startoffset = max(0, feature_start - var_start) endoffset = max(0, var_end - feature_end) pruned_start, pruned_end = max(0, rel_start), min(len(sequence), rel_end) if action == "=": if E.global_options.loglevel >= 10: E.debug( "adding SNP at postition %i: reference=%s variants=%s" % (var_start, reference, variantseqs)) if allele1[rel_start] == "" or allele2[rel_start] == "": # these can be cases, where a base is deleted in one allele, # but recorded as a homozygous substitution in another allele. 
E.warn("substitution conflicts with a deletion - ignored: %s" % str((var_start, var_end, reference, action, has_wildtype, variantseqs))) continue assert rel_start >= 0 assert sequence[rel_start].upper() == reference, \ 'reference base mismatch: expected %s %s %s, got %s at feature=%i-%i, variant=%i-%i, relative=%i-%i, pruned=%i-%i, action=%s' % \ (sequence[rel_start - 10:rel_start], sequence[rel_start].upper(), sequence[rel_start + 1:rel_start + 10], reference, feature_start, feature_end, var_start, var_end, rel_start, rel_end, pruned_start, pruned_end, action) if phased: allele1[rel_start] = variantseqs[0] + allele1[rel_start][1:] allele2[rel_start] = variantseqs[1] + allele2[rel_start][1:] elif is_homozygous: allele1[rel_start] = variantseqs[0] + allele1[rel_start][1:] allele2[rel_start] = variantseqs[0] + allele2[rel_start][1:] else: if has_wildtype: if reference == variantseqs[0]: allele2[rel_start] = variantseqs[1] + allele2[ rel_start][1:] else: allele2[rel_start] = variantseqs[0] + allele2[ rel_start][1:] else: allele1[ rel_start] = variantseqs[0] + allele1[rel_start][1:] allele2[ rel_start] = variantseqs[1] + allele2[rel_start][1:] elif action == "-": if phased: _delete(allele1, pruned_start, pruned_end, variantseqs[0], sequence, startoffset, endoffset, feature_start, feature_end) _delete(allele2, pruned_start, pruned_end, variantseqs[1], sequence, startoffset, endoffset, feature_start, feature_end) elif is_homozygous: _delete(allele1, pruned_start, pruned_end, variantseqs[0], sequence, startoffset, endoffset, feature_start, feature_end) _delete(allele2, pruned_start, pruned_end, variantseqs[0], sequence, startoffset, endoffset, feature_start, feature_end) else: if has_wildtype: _delete(allele2, pruned_start, pruned_end, variantseqs[0], sequence, startoffset, endoffset, feature_start, feature_end) else: _delete(allele1, pruned_start, pruned_end, variantseqs[0], sequence, startoffset, endoffset, feature_start, feature_end) _delete(allele2, pruned_start, pruned_end, variantseqs[1], sequence, startoffset, endoffset, feature_start, feature_end) elif action == "+": # ignore insertions at position -1 if rel_start < 0: continue if phased: allele1[rel_start] += variantseqs[0].upper() allele2[rel_start] += variantseqs[1].upper() elif is_homozygous: allele1[rel_start] += variantseqs[0].upper() allele2[rel_start] += variantseqs[0].upper() else: if has_wildtype: allele2[rel_start] += variantseqs[0].upper() else: allele1[rel_start] += variantseqs[0].upper() allele2[rel_start] += variantseqs[1].upper() elif action == ">": # indel if rel_start >= 0: allele1[rel_start] += variantseqs[0].upper() _delete(allele2, pruned_start, pruned_end, variantseqs[1], sequence, startoffset, endoffset, feature_start, feature_end) elif action == "<": # delin if rel_start >= 0: allele2[rel_start] += variantseqs[1].upper() _delete(allele1, pruned_start, pruned_end, variantseqs[0], sequence, startoffset, endoffset, feature_start, feature_end) assert len(sequence) == len(allele1) assert len(sequence) == len(allele2) return (allele1, allele2)
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser( version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option( "-g", "--glob", dest="glob_pattern", type="string", help="glob pattern to use for collecting files [%default].") parser.add_option( "-f", "--file-pattern", dest="file_pattern", type="string", help="only check files matching this pattern [%default].") parser.add_option("-m", "--mode", dest="mode", type="choice", choices=("file", "node"), help="analysis mode [%default].") parser.add_option( "-r", "--recursive", action="store_true", help="recursively look for logfiles from current directory " "[%default].") parser.set_defaults( truncate_sites_list=0, glob_pattern="*.log", mode="file", recursive=False, ) (options, args) = E.Start(parser) if args: filenames = args elif options.glob_pattern: filenames = glob.glob(options.glob_pattern) if len(filenames) == 0: raise ValueError("no files to analyse") if options.mode == "file": totals = Logfile.LogFileData() options.stdout.write("file\t%s\n" % totals.getHeader()) for filename in filenames: if filename == "-": infile = sys.stdin elif filename[-3:] == ".gz": infile = gzip.open(filename, "r") else: infile = open(filename, "r") subtotals = Logfile.LogFileData() for line in infile: subtotals.add(line) infile.close() options.stdout.write("%s\t%s\n" % (filename, str(subtotals))) totals += subtotals options.stdout.write("%s\t%s\n" % ("total", str(totals))) elif options.mode == "node": chunks_per_node = {} rx_node = re.compile("# job started at .* \d+ on (\S+)") for filename in filenames: if filename == "-": infile = sys.stdin elif filename[-3:] == ".gz": infile = gzip.open(filename, "r") else: infile = open(filename, "r") data = Logfile.LogFileDataLines() for line in infile: if rx_node.match(line): node_id = rx_node.match(line).groups()[0] data = Logfile.LogFileDataLines() if node_id not in chunks_per_node: chunks_per_node[node_id] = [] chunks_per_node[node_id].append(data) continue data.add(line) options.stdout.write("node\t%s\n" % data.getHeader()) total = Logfile.LogFileDataLines() for node, data in sorted(chunks_per_node.items()): subtotal = Logfile.LogFileDataLines() for d in data: # options.stdout.write( "%s\t%s\n" % (node, str(d) ) ) subtotal += d options.stdout.write("%s\t%s\n" % (node, str(subtotal))) total += subtotal options.stdout.write("%s\t%s\n" % ("total", str(total))) E.Stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.OptionParser( version= "%prog version: $Id: split_fasta.py 1714 2007-12-11 16:51:12Z andreas $" ) parser.add_option("-f", "--file", dest="input_filename", type="string", help="input filename. If not given, stdin is used.", metavar="FILE") parser.add_option( "-i", "--input-pattern", dest="input_pattern", type="string", help="input pattern. Parses description line in order to extract id.") parser.add_option( "-o", "--output-filename-pattern", dest="output_pattern", type="string", help="output pattern. Gives filename for a given sequence.") parser.add_option( "-n", "--num-sequences", dest="num_sequences", type="int", help="split by number of sequences (not implemented yet).") parser.add_option("-m", "--map", dest="map_filename", type="string", help="map filename. Map identifiers to filenames", metavar="FILE") parser.add_option("-s", "--skip-identifiers", dest="skip_identifiers", action="store_true", help="do not write identifiers.", metavar="FILE") parser.add_option("--min-size", dest="min_size", type="int", help="minimum cluster size.") parser.set_defaults(input_filename=None, map_filename=None, skip_identifiers=False, input_pattern="^(\S+)", min_size=0, num_sequences=None, output_pattern="%s") (options, args) = E.start(parser) if options.input_filename: infile = IOTools.open_file(options.input_filename, "r") else: infile = sys.stdin if options.map_filename: map_id2filename = IOTools.ReadMap(open(options.map_filename, "r")) else: map_id2filename = {} if options.num_sequences: files = FilesChunks(chunk_size=options.num_sequences, output_pattern=options.output_pattern, skip_identifiers=options.skip_identifiers) else: files = Files(output_pattern=options.output_pattern, skip_identifiers=options.skip_identifiers) if options.input_pattern: rx = re.compile(options.input_pattern) else: rx = None ninput = 0 noutput = 0 identifier = None chunk = 0 for seq in FastaIterator.iterate(infile): ninput += 1 if rx: try: identifier = rx.search(seq.title).groups()[0] except AttributeError: print("# parsing error in description line %s" % (seq.title)) else: identifier = seq.title if map_id2filename: if identifier in map_id2filename: identifier = map_id2filename[identifier] else: continue files.Write(identifier, seq) noutput += 1 if options.input_filename: infile.close() # delete all clusters below a minimum size # Note: this has to be done at the end, because # clusters sizes are only available once both the fasta # file and the map has been parsed. if options.min_size: ndeleted = files.DeleteFiles(min_size=options.min_size) else: ndeleted = 0 if options.loglevel >= 1: print("# input=%i, output=%i, ndeleted=%i" % (ninput, noutput, ndeleted)) E.stop()
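# Example of the --input-pattern default (r"^(\S+)"): the identifier used to
# route a sequence to an output file is the first whitespace-delimited token
# of its description line. The description line below is illustrative only.
import re
rx = re.compile(r"^(\S+)")
print(rx.search("scaffold_1 length=1200 cluster=7").groups()[0])  # scaffold_1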
def main(argv=None): if not argv: argv = sys.argv parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-e", "--output-equivalent", dest="write_equivalent", action="store_true", help="write equivalent entries [default=%default].") parser.add_option("-f", "--output-full", dest="write_full", action="store_true", help="write full gff entries [default=%default].") parser.add_option("-p", "--add-percent", dest="add_percent", action="store_true", help="add percentage columns [default=%default].") parser.add_option("-s", "--ignore-strand", dest="ignore_strand", action="store_true", help="ignore strand information [default=%default].") parser.set_defaults( write_equivalent=False, write_full=False, add_percent=False, ignore_strand=False, as_gtf=False, ) (options, args) = E.start(parser, argv, add_output_options=True) if len(args) != 2: raise ValueError("two arguments required") input_filename1, input_filename2 = args # duplicated features cause a problem. Make sure # features are non-overlapping by running # gff_combine.py on GFF files first. E.info("reading data started") idx, genes2 = {}, set() for e in GTF.readFromFile(IOTools.open_file(input_filename2, "r")): genes2.add(e.gene_id) if e.contig not in idx: idx[e.contig] = bx.intervals.intersection.Intersecter() idx[e.contig].add_interval( bx.intervals.Interval(e.start, e.end, value=e)) overlaps_genes = [] E.info("reading data finished: %i contigs" % len(idx)) # outfile_diff and outfile_overlap not implemented # outfile_diff = getFile( options, "diff" ) # outfile_overlap = getFile( options, "overlap" ) overlapping_genes = set() genes1 = set() # iterate over exons with IOTools.open_file(input_filename1, "r") as infile: for this in GTF.iterator(infile): genes1.add(this.gene_id) try: intervals = idx[this.contig].find(this.start, this.end) except KeyError: continue others = [x.value for x in intervals] for other in others: overlapping_genes.add((this.gene_id, other.gene_id)) # check for identical/half-identical matches output = None for other in others: if this.start == other.start and this.end == other.end: output, symbol = other, "=" break else: for other in others: if this.start == other.start or this.end == other.end: output, symbol = other, "|" break else: symbol = "~" # if outfile_diff != options.stdout: outfile_diff.close() # if outfile_overlap != options.stdout: outfile_overlap.close() outfile = None ################################################################## ################################################################## ################################################################## # print gene based information ################################################################## if overlapping_genes: outfile = getFile(options, "genes_ovl") outfile.write("gene_id1\tgene_id2\n") for a, b in sorted(overlapping_genes): outfile.write("%s\t%s\n" % (a, b)) if outfile != options.stdout: outfile.close() outfile_total = getFile(options, "genes_total") outfile_total.write( "set\tngenes\tnoverlapping\tpoverlapping\tnunique\tpunique\n") outfile = getFile(options, "genes_uniq1") b = set([x[0] for x in overlapping_genes]) d = genes1.difference(b) outfile.write("gene_id1\n") outfile.write("\n".join(sorted(d)) + "\n") if outfile != options.stdout: outfile.close() outfile_total.write( "%s\t%i\t%i\t%5.2f\t%i\t%5.2f\n" % (os.path.basename(input_filename1), len(genes1), len(b), 100.0 * len(b) / len(a), len(d), 100.0 * len(d) / len(genes1))) outfile = getFile(options, "genes_uniq2") b = set([x[1] for x 
in overlapping_genes]) d = genes2.difference(b) outfile.write("gene_id2\n") outfile.write("\n".join(sorted(d)) + "\n") if outfile != options.stdout: outfile.close() outfile_total.write( "%s\t%i\t%i\t%5.2f\t%i\t%5.2f\n" % (os.path.basename(input_filename2), len(genes2), len(b), 100.0 * len(b) / len(genes2), len(d), 100.0 * len(d) / len(genes2))) if outfile_total != options.stdout: outfile_total.close() E.stop()