def getRepeatDataFromUCSC(dbhandle, repclasses, outfile, remove_contigs_regex=None, job_memory="4G"): '''download repeat data from the UCSC database and write it to `outfile` in :term:`gff` format. This method downloads repeats from the RepeatMasker tracks at UCSC. Arguments --------- dbhandle : object Database handle to UCSC mysql database repclasses : list List of repeat classes to select. If empty, all repeat classes will be collected. outfile : string Filename of output file in :term:`gff` format. remove_contigs_regex : list If given, remove repeats on contigs matching any of the regular expressions in this list. job_memory : string Memory to allocate for the sort/conversion job. ''' cc = dbhandle.execute("SHOW TABLES LIKE '%%rmsk'") tables = [x[0] for x in cc.fetchall()] if len(tables) == 0: raise ValueError("could not find any `rmsk` tables") # now collect repeats tmpfile = P.get_temp_file(".") for table in tables: sql = """SELECT genoName, 'repeat', 'exon', genoStart+1, genoEnd, '.', strand, '.', CONCAT('class \\"', repClass, '\\"; family \\"', repFamily, '\\"; repName \\"', repName, '\\";') FROM %(table)s""" if repclasses: repclasses_str = ",".join( ["'" + x.strip() + "'" for x in repclasses]) sql += ''' WHERE repClass in (%(repclasses_str)s) ''' % locals() sql = sql % locals() E.debug("executing sql statement: %s" % sql) cc = dbhandle.execute(sql) for data in cc.fetchall(): tmpfile.write("\t".join(map(str, data)) + "\n") tmpfile.close() # sort gff and make sure that names are correct tmpfilename = tmpfile.name statement = [ '''cat %(tmpfilename)s | sort -t$'\\t' -k1,1 -k4,4n | cgat gff2gff --method=sanitize --sanitize-method=genome --skip-missing --genome-file=%(genome_dir)s/%(genome)s --log=%(outfile)s.log ''' ] if remove_contigs_regex: statement.append('--contig-pattern="{}"'.format( ",".join(remove_contigs_regex))) statement.append('''| gzip > %(outfile)s ''') statement = " ".join(statement) P.run(statement, job_memory=job_memory) os.unlink(tmpfilename)
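# Note on the output of getRepeatDataFromUCSC: each selected rmsk row is
# written as one tab-separated GFF line before sorting and sanitizing. A
# hypothetical output line (contig and repeat names are illustrative only)
# would look like:
#
#   chr1  repeat  exon  10001  10468  .  +  .  class "LINE"; family "L1"; repName "L1PA7";
#
# i.e. column 2 is the literal source "repeat", column 3 the literal feature
# "exon", and the attributes carry repClass/repFamily/repName as built by the
# CONCAT() expression in the SQL above.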
def main(argv=sys.argv): parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option("-i", "--input-vcf", dest="input_vcf_file", type="string", help="input vcf file") parser.add_option("-t", "--truth-vcf", dest="truth_vcf_file", type="string", help="truth vcf file") parser.add_option( "-f", "--input-fasta", dest="input_fasta_file", type="string", help="input fasta file. faidx indexed reference sequence file to " "determine INDEL context [%default]") parser.add_option( "-e", "--input-bed", dest="input_bed_file", type="string", help="input file with intervals. Tab-delimited file of intervals " "in bed format to restrict analysis to. [%default]") parser.add_option("-m", "--method", dest="methods", action="append", type="choice", choices=("mutational-signature", "kinship"), help="methods to apply [%default]") parser.set_defaults( methods=[], input_vcf_file=None, input_bed_file=None, input_fasta_file=None, truth_vcf_file=None, ) (options, args) = E.start(parser, argv, add_output_options=True) if len(args) == 1: options.input_vcf_file = args[0] if options.input_vcf_file is None: raise ValueError("please supply a VCF file") if options.truth_vcf_file is None: raise ValueError("please supply a VCF file with truth data") if options.input_fasta_file is None: raise ValueError( "please supply a fasta file with the reference genome") if not os.path.exists(options.input_vcf_file): raise OSError("input vcf file {} does not exist".format( options.input_vcf_file)) if not os.path.exists(options.input_vcf_file + ".tbi"): raise OSError("input vcf file {} needs to be indexed".format( options.input_vcf_file)) if not os.path.exists(options.truth_vcf_file): raise OSError("truth vcf file {} does not exist".format( options.truth_vcf_file)) if not os.path.exists(options.truth_vcf_file + ".tbi"): raise OSError("truth vcf file {} needs to be indexed".format( options.truth_vcf_file)) if not os.path.exists(options.input_fasta_file): raise OSError("input fasta file {} does not exist".format( options.input_fasta_file)) if not os.path.exists(options.input_fasta_file + ".fai"): raise OSError("input fasta file {} needs to be indexed".format( options.input_fasta_file)) # update paths to absolute options.input_fasta_file = os.path.abspath(options.input_fasta_file) options.input_vcf_file = os.path.abspath(options.input_vcf_file) options.truth_vcf_file = os.path.abspath(options.truth_vcf_file) test_vcf = pysam.VariantFile(options.input_vcf_file) truth_vcf = pysam.VariantFile(options.truth_vcf_file) contigs = test_vcf.header.contigs truth_contigs = set(truth_vcf.header.contigs) test_vcf_samples = set(test_vcf.header.samples) truth_vcf_samples = set(truth_vcf.header.samples) common_samples = test_vcf_samples.intersection(truth_vcf_samples) if len(common_samples) == 0: raise ValueError("no common samples in test/truth VCFs") def pair_iterator(test_vcf, truth_vcf, contig): counter = E.Counter() test_iter = test_vcf.fetch(contig) truth_iter = truth_vcf.fetch(contig) test_record = next(test_iter) truth_record = next(truth_iter) try: while 1: if test_record.pos < truth_record.pos: test_record = next(test_iter) continue elif test_record.pos > truth_record.pos: truth_record = next(truth_iter) continue elif len(test_record.alts) > 1: counter.skip_test_truth += 1 test_record = next(test_iter) continue elif len(truth_record.alts) > 1: counter.skip_multiallelic_truth += 1 truth_record = next(truth_iter) continue elif test_record.alts != truth_record.alts: counter.skip_genotype_difference += 1 
test_record = next(test_iter) truth_record = next(truth_iter) continue if test_record.ref != truth_record.ref: # todo: deal with indels raise ValueError("mismatching reference bases at position " "{}:{}".format(test_record.chrom, test_record.pos)) yield test_record, truth_record test_record = next(test_iter) truth_record = next(truth_iter) except StopIteration: pass E.debug(str(counter)) counters_per_contig = {} for contig in contigs: counter_contig = collections.defaultdict(E.Counter) counters_per_contig[contig] = counter_contig E.info("processing contig {}".format(contig)) if contig not in truth_contigs: E.warn( "skipping contig {} as it is not in truth data".format(contig)) continue switch = False last_is_unphased = True for test_record, truth_record in pair_iterator(test_vcf, truth_vcf, contig): for sample in common_samples: counter = counter_contig[sample] truth_phased = truth_record.samples[sample].phased test_phased = test_record.samples[sample].phased truth_genotype = truth_record.samples[sample]["GT"] test_genotype = test_record.samples[sample]["GT"] truth_alleles = set(truth_genotype) test_alleles = set(test_genotype) ignore = False if not truth_phased: counter.truth_unphased += 1 ignore = True if not test_phased: counter.test_unphased += 1 ignore = True last_is_unphased = True else: last_is_unphased = False if len(test_alleles) == 1: counter.test_homozygous += 1 ignore = True else: if not test_phased: counter.test_unphased_hets += 1 if len(truth_alleles) == 1: counter.truth_homozygous += 1 ignore = True if ignore: counter.ignore += 1 continue E.debug("comparing: {}:{} {} -> {}: {} {}".format( test_record.chrom, test_record.pos, test_record.ref, test_record.alts, test_genotype, truth_genotype)) if switch: truth_genotype = truth_genotype[::-1] counter.test_phased_hets += 1 if truth_genotype != test_genotype: if not last_is_unphased: E.debug("SWITCH: {}".format(switch)) counter.switch += 1 switch = not switch outf = options.stdout outf.write("\t".join( ("contig", "sample", "switch_error_percent", "false_negative_rate", "switches", "test_phased_hets", "test_unphased_hets", "test_unphased", "truth_unphased", "test_homozygous", "truth_homozygous")) + "\n") for contig, contig_dict in list(counters_per_contig.items()): for sample, c in list(contig_dict.items()): outf.write("\t".join( map(str, (contig, sample, "{:6.4f}".format(100.0 * c.switch / (c.test_phased_hets + 1)), "{:6.4f}" .format(100.0 * c.test_unphased_hets / (c.test_phased_hets + c.test_unphased_hets)), c.switch, c.test_phased_hets, c.test_unphased_hets, c.test_unphased, c.truth_unphased, c.test_homozygous, c.truth_homozygous))) + "\n") E.stop()
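# Worked example for the switch-error computation above (numbers are made up
# for illustration): if a sample has c.test_phased_hets = 99 phased
# heterozygous sites that were compared and c.switch = 3 phase switches were
# recorded, the reported switch_error_percent is 100.0 * 3 / (99 + 1) = 3.0.
# The "+ 1" in the denominator only guards against division by zero when no
# phased hets were seen. Similarly, false_negative_rate is the percentage of
# heterozygous test calls that were unphased.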
def join_tables(outfile, options, args): '''join tables.''' if options.headers and options.headers[0] != "auto" and \ len(options.headers) != len(options.filenames): raise ValueError("number of provided headers (%i) " "is not equal to number filenames (%i)." % (len(options.headers), len(options.filenames))) tables = [] keys = {} sorted_keys = [] sizes = {} if options.merge: titles = ["count"] else: titles = [] headers_to_delete = [] if options.prefixes: prefixes = [x.strip() for x in options.prefixes.split(",")] if len(prefixes) != len(options.filenames): raise ValueError( ("number of prefixes (%i) and tables (%i) " "do not match") % (len(prefixes), len(options.filenames))) else: prefixes = None E.debug("joining on columns %s and taking columns %s" % (options.columns, options.take)) for nindex, filename in enumerate(options.filenames): E.info("processing %s (%i/%i)" % (filename, nindex + 1, len(options.filenames))) prefix = os.path.basename(filename) lines = read_table(filename, options) try: # check if the table is empty data = next(lines).split() except StopIteration: # an empty table will raise a StopIteration # skip (or not skip) empty tables if options.ignore_empty: E.warn("%s is empty - skipped" % filename) headers_to_delete.append(nindex) continue table = {} sizes = {} max_size = 0 ncolumns = 0 if options.input_has_titles: # See https://github.com/cgat-developers/cgat-core/pull/53 # data = next(lines).split() # no titles have been defined so far if not titles: key = "-".join([data[x] for x in options.columns]) titles = [key] # set take based on column titles or numerically if options.take: take = [] # convert numeric columns for filtering for x in options.take: try: take.append(int(x) - 1) except ValueError: # will raise error if x is not present take.append(data.index(x)) else: # tables with max 100 columns take = None for x in range(len(data)): if x in options.columns or (take and x not in take): continue ncolumns += 1 if options.add_file_prefix: try: p = re.search(options.regex_filename, prefix).groups()[0] except AttributeError: E.warn("can't extract title from filename %s" % prefix) p = "unknown" titles.append("%s_%s" % (p, data[x])) elif options.use_file_prefix: try: p = re.search(options.regex_filename, prefix).groups()[0] except: E.warn("can't extract title from filename %s" % prefix) p = "unknown" titles.append("%s" % p) elif prefixes: titles.append("%s_%s" % (prefixes[nindex], data[x])) else: titles.append(data[x]) else: # set take based on numeric columns if no titles are present if options.take: take = [] # convert numeric columns for filtering for x in options.take: take.append(int(x) - 1) else: # tables with max 100 columns take = None # IMS: We might still want filename titles even if the input # columns don't have titles. 
if options.add_file_prefix: if not titles: titles = ["ID"] try: p = re.search(options.regex_filename, prefix).groups()[0] except AttributeError: E.warn("can't extract title from filename %s" % prefix) p = "unknown" titles.append("%s_%s" % (p, data[x])) elif options.use_file_prefix: if not titles: titles = ["ID"] try: p = re.search(options.regex_filename, prefix).groups()[0] except AttributeError: E.warn("can't extract title from filename %s" % prefix) p = "unknown" titles.append("%s" % p) ncolumns = 1 n = 0 for line in lines: data = line[:-1].split("\t") try: row_keys = [data[x] for x in options.columns] except IndexError as msg: raise IndexError("error while parsing %s: %s" % (filename, msg)) if options.sort_keys: if options.sort_keys == "numeric": row_keys.sort(key=float) else: row_keys.sort() if options.merge: key = n else: key = "-".join(row_keys) if key not in keys: sorted_keys.append(key) keys[key] = 1 sizes[key] = 0 if take: max_size = len(take) table[key] = [data[x] for x in take] else: max_size = max(len(data) - len(options.columns), max_size) table[key] = [ data[x] for x in range(0, len(data)) if x not in options.columns ] n += 1 # enter columns of "na" for empty tables. if max_size == 0: max_size = ncolumns tables.append((max_size, table)) # delete in reverse order if options.headers: for nindex in headers_to_delete[::-1]: del options.headers[nindex] if len(tables) == len(titles) - 1: if options.headers: headers = ["bin"] if options.headers[0] == 'auto': for t in range(len(tables)): headers.append(os.path.basename(options.filenames[t])) headers += [""] * (tables[t][0] - 1) else: for t in range(len(tables)): headers.append(options.headers[t]) headers += [""] * (tables[t][0] - 1) # use headers as titles, if headers is given and skip-titles is # turned on if options.input_has_titles and options.skip_titles: titles = headers else: # otherwise: print the headers out right away outfile.write("\t".join(headers) + "\n") order = list(range(0, len(tables) + 1)) if options.input_has_titles or \ (options.use_file_prefix or options.add_file_prefix): if options.sort: sort_order = [] if options.sort == "numeric": t = list( zip(list(map(int, titles[1:])), list(range(1, len(titles) + 1)))) t.sort() for tt in t: sort_order.append(titles[tt[1]]) elif options.sort == "alphabetical": t = list(zip(titles[1:], list(range(1, len(titles) + 1)))) t.sort() for tt in t: sort_order.append(titles[tt[1]]) else: sort_order = options.sort map_title2pos = {} for x in range(1, len(titles)): map_title2pos[titles[x]] = x order = [ 0, ] for x in sort_order: if x in map_title2pos: order.append(map_title2pos[x]) else: order = list(range(0, len(titles))) outfile.write("\t".join( [titles[order[x]] for x in range(len(titles))])) outfile.write("\n") if options.sort_keys: if options.sort_keys == "numeric": sorted_keys.sort(key=float) else: sorted_keys.sort() for key in sorted_keys: outfile.write("%s" % key) for x in order[1:]: max_size, table = tables[x - 1] c = 0 if key in table: outfile.write("\t") outfile.write("\t".join(table[key])) c = len(table[key]) assert (max_size == 1) outfile.write("\t%s" % options.missing_value * (max_size - c)) outfile.write("\n") else: # for multi-column table, just write if options.input_has_titles: outfile.write("\t".join([titles[x] for x in range(len(titles))])) outfile.write("\n") for key in sorted_keys: outfile.write("%s" % key) for x in range(len(tables)): max_size, table = tables[x] c = 0 if key in table: outfile.write("\t")
outfile.write("\t".join(table[key])) c = len(table[key]) outfile.write("\t%s" % options.missing_value * (max_size - c)) outfile.write("\n")
def main(argv=sys.argv): parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option( "-d", "--data-directory", dest="data_directory", type="string", help="directory in which to create links. " "[%default]") parser.add_option( "-f", "--force-first", dest="force_first", action="store_true", help="force running of pipeline in first instance [%default]") parser.add_option( "-k", "--keep-level", dest="keep_level", type="int", help="level to keep for the directories that are being watched [%default]") parser.add_option( "-c", "--command", dest="command", type="string", help="command to run when new data appears [%default]") parser.set_defaults( data_directory="data", last_update=3600, input_fastq_file=None, keep_level=3, force_first=False, sleep=60, command="daisy run -v 5 -p 100 make all", ) (options, args) = E.start(parser, argv) if not os.path.exists("benchmark.yml"): raise ValueError("config file {} does not exist".format( "benchmark.yml")) with IOTools.open_file("benchmark.yml") as inf: config = yaml.load(inf, Loader=yaml.FullLoader) if "watch" not in config: raise ValueError("config file needs to contain a 'watch' section") if isinstance(config["watch"], list): watchlist = config["watch"] else: watchlist = [config["watch"]] E.info("watching with {} glob expressions".format(len(watchlist))) iteration = 1 while 1: current_time = time.time() c = E.Counter() for glob_expr in watchlist: filenames = glob.glob(glob_expr) E.debug("found {} files for {}".format(len(filenames), glob_expr)) for fn in filenames: c.found += 1 parts = os.path.abspath(fn).split(os.sep) dest_fn = os.path.abspath( os.path.join( options.data_directory, os.sep.join(parts[-options.keep_level:]))) dirname = os.path.dirname(dest_fn) if not os.path.exists(dirname): E.info("creating new directory {}".format(dirname)) os.makedirs(dirname) if not os.path.exists(dest_fn): modification_time = os.path.getmtime(fn) timedelta = current_time - modification_time if timedelta > options.last_update: E.info("new file detected, creating link: {}".format(dest_fn)) c.new_file_create += 1 os.symlink(os.path.abspath(fn), dest_fn) else: E.info( "new file detected, but too recent ({}s): {}".format( timedelta, dest_fn)) c.new_file_wait += 1 else: c.existing += 1 E.info("iteration {}: {}".format(iteration, str(c))) if iteration == 1 and options.force_first: E.run(options.command) elif c.new_file_create == 0: E.info("found no new files, waiting for {} seconds".format(options.sleep)) time.sleep(options.sleep) else: E.run(options.command) iteration += 1
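# A minimal sketch of the "watch" section this tool expects in benchmark.yml
# (the paths are purely illustrative). The value may be a single glob
# expression or a list of them, as handled above:
#
#   watch:
#     - /data/incoming/run*/*.fastq.gz
#     - /data/incoming/run*/*.bam
#
# Every file matched by one of these globs is linked under
# options.data_directory (keeping the last --keep-level path components) once
# it has not been modified for options.last_update seconds.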
def annotateGenome(iterator, fasta, options, default_code=DEFAULT_CODE): """annotate a genome given by the indexed *fasta* file and an iterator over gtf annotations. """ annotations = {} contig_sizes = fasta.getContigSizes(with_synonyms=False) E.info("allocating memory for %i contigs and %i bytes" % (len(contig_sizes), sum(contig_sizes.values()) * array.array("B").itemsize)) # AString.AString( "a").itemsize )) for contig, size in list(contig_sizes.items()): E.debug("allocating %s: %i bases" % (contig, size)) # annotations[contig] = AString.AString( default_code * size ) # annotations[contig] = array.array("", default_code * size) # Go to list for py3 compatibility, patch annotations[contig] = [default_code] * size E.info("allocated memory for %i contigs" % len(fasta)) counter = E.Counter() # output splice junctions outfile_junctions = E.open_output_file("junctions") outfile_junctions.write( "contig\tstrand\tpos1\tpos2\tframe\tgene_id\ttranscript_id\n") for gtfs in iterator: counter.input += 1 if counter.input % options.report_step == 0: E.info("iteration %i" % counter.input) try: contig = fasta.getToken(gtfs[0].contig) except KeyError as msg: E.warn("contig %s not found - annotation ignored" % gtfs[0].contig) counter.skipped_contig += 1 continue lcontig = fasta.getLength(contig) # make sure that exons are sorted by coordinate gtfs.sort(key=lambda x: x.start) is_positive = Genomics.IsPositiveStrand(gtfs[0].strand) source = gtfs[0].source # process non-coding data if source in MAP_ENSEMBL: code = MAP_ENSEMBL[source] intervals = [(x.start, x.end) for x in gtfs] addSegments(annotations[contig], intervals, is_positive, code) elif source == "protein_coding": # collect exons for utr exons = [(x.start, x.end) for x in gtfs if x.feature == "exon"] cds = [(x.start, x.end) for x in gtfs if x.feature == "CDS"] if len(cds) == 0: counter.skipped_transcripts += 1 E.warn("protein-coding transcript %s without CDS - skipped" % gtfs[0].transcript_id) continue exons = Intervals.truncate(exons, cds) start, end = cds[0][0], cds[-1][1] UTR5 = [x for x in exons if x[1] < start] UTR3 = [x for x in exons if x[0] >= end] if not is_positive: UTR5, UTR3 = UTR3, UTR5 splice_code = "S" else: splice_code = "s" addSegments(annotations[contig], UTR5, is_positive, "u") addIntrons(annotations[contig], UTR5, is_positive, options.max_frameshift_length) addSegments(annotations[contig], UTR3, is_positive, "v") addIntrons(annotations[contig], UTR3, is_positive, options.max_frameshift_length) # output CDS according to frame addCDS(annotations[contig], [x for x in gtfs if x.feature == "CDS"], is_positive) # add introns between CDS addIntrons(annotations[contig], cds, is_positive, options.max_frameshift_length) # output splice junctions cds = [x for x in gtfs if x.feature == "CDS"] # apply corrections for 1-past end coordinates # to point between residues within CDS if is_positive: ender = lambda x: x.end - 1 starter = lambda x: x.start out_positive = "+" else: ender = lambda x: lcontig - x.start - 1 starter = lambda x: lcontig - x.end out_positive = "-" cds.reverse() end = ender(cds[0]) for c in cds[1:]: start = starter(c) outfile_junctions.write("%s\t%s\t%i\t%i\t%s\t%s\t%s\n" % ( contig, out_positive, end, start, c.frame, c.gene_id, c.transcript_id, )) end = ender(c) E.info("finished reading genes: %s" % str(counter)) outfile_junctions.close() E.info("started counting") outfile = E.open_output_file("counts") outputCounts(outfile, annotations) outfile.close() E.info("started output") for k in sorted(annotations.keys()): # 
options.stdout.write(">%s\n%s\n" % (k, annotations[k].tostring())) options.stdout.write(">%s\n%s\n" % (k, "".join(annotations[k])))
def main(argv=None): if argv is None: argv = sys.argv parser = E.ArgumentParser(description=__doc__) parser.add_argument("--version", action='version', version="1.0") parser.add_argument( "-m", "--method", dest="method", type=str, choices=("add-flank", "add-upstream-flank", "add-downstream-flank", "crop", "crop-unique", "complement-groups", "combine-groups", "filter-range", "join-features", "merge-features", "sanitize", "to-forward-coordinates", "to-forward-strand", "rename-chr"), help="method to apply ") parser.add_argument("--ignore-strand", dest="ignore_strand", help="ignore strand information.", action="store_true") parser.add_argument("--is-gtf", dest="is_gtf", action="store_true", help="input will be treated as gtf.") parser.add_argument("-c", "--contigs-tsv-file", dest="input_filename_contigs", type=str, help="filename with contig lengths.") parser.add_argument( "--agp-file", dest="input_filename_agp", type=str, help="agp file to map coordinates from contigs to scaffolds.") parser.add_argument("-g", "--genome-file", dest="genome_file", type=str, help="filename with genome.") parser.add_argument("--crop-gff-file", dest="filename_crop_gff", type=str, help="GFF/GTF file to crop against.") parser.add_argument( "--group-field", dest="group_field", type=str, help="""gff field/attribute to group by such as gene_id, " "transcript_id, ... .""") parser.add_argument( "--filter-range", dest="filter_range", type=str, help="extract all elements overlapping a range. A range is " "specified by eithor 'contig:from..to', 'contig:+:from..to', " "or 'from,to' .") parser.add_argument("--sanitize-method", dest="sanitize_method", type=str, choices=("ucsc", "ensembl", "genome"), help="method to use for sanitizing chromosome names. " ".") parser.add_argument( "--flank-method", dest="flank_method", type=str, choices=("add", "extend"), help="method to use for adding flanks. ``extend`` will " "extend existing features, while ``add`` will add new features. " ".") parser.add_argument("--skip-missing", dest="skip_missing", action="store_true", help="skip entries on missing contigs. 
Otherwise an " "exception is raised .") parser.add_argument( "--contig-pattern", dest="contig_pattern", type=str, help="a comma separated list of regular expressions specifying " "contigs to be removed when running method sanitize .") parser.add_argument( "--assembly-report", dest="assembly_report", type=str, help="path to assembly report file which allows mapping of " "ensembl to ucsc contigs when running method sanitize .") parser.add_argument( "--assembly-report-hasids", dest="assembly_report_hasIDs", type=int, help="path to assembly report file which allows mapping of " "ensembl to ucsc contigs when running method sanitize .") parser.add_argument( "--assembly-report-ucsccol", dest="assembly_report_ucsccol", type=int, help="column in the assembly report containing ucsc contig ids" ".") parser.add_argument( "--assembly-report-ensemblcol", dest="assembly_report_ensemblcol", type=int, help="column in the assembly report containing ensembl contig ids") parser.add_argument( "--assembly-extras", dest="assembly_extras", type=str, help="additional mismatches between gtf and fasta to fix when" "sanitizing the genome .") parser.add_argument("--extension-upstream", dest="extension_upstream", type=float, help="extension for upstream end .") parser.add_argument("--extension-downstream", dest="extension_downstream", type=float, help="extension for downstream end .") parser.add_argument("--min-distance", dest="min_distance", type=int, help="minimum distance of features to merge/join .") parser.add_argument("--max-distance", dest="max_distance", type=int, help="maximum distance of features to merge/join .") parser.add_argument("--min-features", dest="min_features", type=int, help="minimum number of features to merge/join .") parser.add_argument("--max-features", dest="max_features", type=int, help="maximum number of features to merge/join .") parser.add_argument( "--rename-chr-file", dest="rename_chr_file", type=str, help="mapping table between old and new chromosome names." 
"TAB separated 2-column file.") parser.set_defaults(input_filename_contigs=False, filename_crop_gff=None, input_filename_agp=False, genome_file=None, rename_chr_file=None, add_up_flank=None, add_down_flank=None, complement_groups=False, crop=None, crop_unique=False, ignore_strand=False, filter_range=None, min_distance=0, max_distance=0, min_features=1, max_features=0, extension_upstream=1000, extension_downstream=1000, sanitize_method="ucsc", flank_method="add", output_format="%06i", skip_missing=False, is_gtf=False, group_field=None, contig_pattern=None, assembly_report=None, assembly_report_hasIDs=1, assembly_report_ensemblcol=4, assembly_report_ucsccol=9, assembly_extras=None) (args) = E.start(parser, argv=argv) contigs = None genome_fasta = None chr_map = None if args.input_filename_contigs: contigs = Genomics.readContigSizes( iotools.open_file(args.input_filename_contigs, "r")) if args.genome_file: genome_fasta = IndexedFasta.IndexedFasta(args.genome_file) contigs = genome_fasta.getContigSizes() if args.rename_chr_file: chr_map = {} with open(args.rename_chr_file, 'r') as filein: reader = csv.reader(filein, delimiter='\t') for row in reader: if len(row) != 2: raise ValueError( "Mapping table must have exactly two columns") chr_map[row[0]] = row[1] if not len(chr_map.keys()) > 0: raise ValueError("Empty mapping dictionnary") if args.assembly_report: df = pd.read_csv(args.assembly_report, comment="#", header=None, sep="\t") # fixes naming inconsistency in assembly report: ensembl chromosome # contigs found in columnn 0, ensembl unassigned contigs found in # column 4. if args.assembly_report_hasIDs == 1: ucsccol = args.assembly_report_ucsccol ensemblcol = args.assembly_report_ensemblcol df.loc[df[1] == "assembled-molecule", ensemblcol] = df.loc[df[1] == "assembled-molecule", 0] if args.sanitize_method == "ucsc": assembly_dict = df.set_index(ensemblcol)[ucsccol].to_dict() elif args.sanitize_method == "ensembl": assembly_dict = df.set_index(ucsccol)[ensemblcol].to_dict() else: raise ValueError(''' When using assembly report, please specify sanitize method as either "ucsc" or "ensembl" to specify direction of conversion ''') else: assembly_dict = {} if args.assembly_extras is not None: assembly_extras = args.assembly_extras.split(",") for item in assembly_extras: item = item.split("-") assembly_dict[item[0]] = item[1] if args.method in ("forward_coordinates", "forward_strand", "add-flank", "add-upstream-flank", "add-downstream-flank") \ and not contigs: raise ValueError("inverting coordinates requires genome file") if args.input_filename_agp: agp = AGP.AGP() agp.readFromFile(iotools.open_file(args.input_filename_agp, "r")) else: agp = None gffs = GTF.iterator(args.stdin) if args.method in ("add-upstream-flank", "add-downstream-flank", "add-flank"): add_upstream_flank = "add-upstream-flank" == args.method add_downstream_flank = "add-downstream-flank" == args.method if args.method == "add-flank": add_upstream_flank = add_downstream_flank = True upstream_flank = int(args.extension_upstream) downstream_flank = int(args.extension_downstream) extend_flank = args.flank_method == "extend" if args.is_gtf: iterator = GTF.flat_gene_iterator(gffs) else: iterator = GTF.joined_iterator(gffs, args.group_field) for chunk in iterator: is_positive = Genomics.IsPositiveStrand(chunk[0].strand) chunk.sort(key=lambda x: (x.contig, x.start)) lcontig = contigs[chunk[0].contig] if extend_flank: if add_upstream_flank: if is_positive: chunk[0].start = max(0, chunk[0].start - upstream_flank) else: chunk[-1].end = 
min(lcontig, chunk[-1].end + upstream_flank) if add_downstream_flank: if is_positive: chunk[-1].end = min(lcontig, chunk[-1].end + downstream_flank) else: chunk[0].start = max(0, chunk[0].start - downstream_flank) else: if add_upstream_flank: gff = GTF.Entry() if is_positive: gff.copy(chunk[0]) gff.end = gff.start gff.start = max(0, gff.start - upstream_flank) chunk.insert(0, gff) else: gff.copy(chunk[-1]) gff.start = gff.end gff.end = min(lcontig, gff.end + upstream_flank) chunk.append(gff) gff.feature = "5-Flank" gff.mMethod = "gff2gff" if add_downstream_flank: gff = GTF.Entry() if is_positive: gff.copy(chunk[-1]) gff.start = gff.end gff.end = min(lcontig, gff.end + downstream_flank) chunk.append(gff) else: gff.copy(chunk[0]) gff.end = gff.start gff.start = max(0, gff.start - downstream_flank) chunk.insert(0, gff) gff.feature = "3-Flank" gff.mMethod = "gff2gff" if not is_positive: chunk.reverse() for gff in chunk: args.stdout.write(str(gff) + "\n") elif args.method == "complement-groups": iterator = GTF.joined_iterator(gffs, group_field=args.group_field) for chunk in iterator: if args.is_gtf: chunk = [x for x in chunk if x.feature == "exon"] if len(chunk) == 0: continue chunk.sort(key=lambda x: (x.contig, x.start)) x = GTF.Entry() x.copy(chunk[0]) x.start = x.end x.feature = "intron" for c in chunk[1:]: x.end = c.start args.stdout.write(str(x) + "\n") x.start = c.end elif args.method == "combine-groups": iterator = GTF.joined_iterator(gffs, group_field=args.group_field) for chunk in iterator: chunk.sort(key=lambda x: (x.contig, x.start)) x = GTF.Entry() x.copy(chunk[0]) x.end = chunk[-1].end x.feature = "segment" args.stdout.write(str(x) + "\n") elif args.method == "join-features": for gff in combineGFF(gffs, min_distance=args.min_distance, max_distance=args.max_distance, min_features=args.min_features, max_features=args.max_features, merge=False, output_format=args.output_format): args.stdout.write(str(gff) + "\n") elif args.method == "merge-features": for gff in combineGFF(gffs, min_distance=args.min_distance, max_distance=args.max_distance, min_features=args.min_features, max_features=args.max_features, merge=True, output_format=args.output_format): args.stdout.write(str(gff) + "\n") elif args.method == "crop": for gff in cropGFF(gffs, args.filename_crop_gff): args.stdout.write(str(gff) + "\n") elif args.method == "crop-unique": for gff in cropGFFUnique(gffs): args.stdout.write(str(gff) + "\n") elif args.method == "filter-range": contig, strand, interval = None, None, None try: contig, strand, start, sep, end = re.match( "(\S+):(\S+):(\d+)(\.\.|-)(\d+)", args.filter_range).groups() except AttributeError: pass if not contig: try: contig, start, sep, end = re.match("(\S+):(\d+)(\.\.|-)(\d+)", args.filter_range).groups() strand = None except AttributeError: pass if not contig: try: start, sep, end = re.match("(\d+)(\.\.|\,|\-)(\d+)", args.filter_range).groups() except AttributeError: raise ValueError("can not parse range %s" % args.filter_range) contig = None strand = None if start: interval = (int(start), int(end)) else: interval = None E.debug("filter: contig=%s, strand=%s, interval=%s" % (str(contig), str(strand), str(interval))) for gff in GTF.iterator_filtered(gffs, contig=contig, strand=strand, interval=interval): args.stdout.write(str(gff) + "\n") elif args.method == "sanitize": def assemblyReport(id): if id in assembly_dict.keys(): id = assembly_dict[id] # if not in dict, the contig name is forced # into the desired convention, this is helpful for user # modified gff files that contain additional
contigs elif args.sanitize_method == "ucsc": if not id.startswith("contig") and not id.startswith("chr"): id = "chr%s" % id elif args.sanitize_method == "ensembl": if id.startswith("contig"): return id[len("contig"):] elif id.startswith("chr"): return id[len("chr"):] return id if args.sanitize_method == "genome": if genome_fasta is None: raise ValueError("please specify --genome-file= when using " "--sanitize-method=genome") f = genome_fasta.getToken else: if args.assembly_report is None: raise ValueError( "please specify --assembly-report= when using " "--sanitize-method=ucsc or ensembl") f = assemblyReport skipped_contigs = collections.defaultdict(int) outofrange_contigs = collections.defaultdict(int) filtered_contigs = collections.defaultdict(int) for gff in gffs: try: gff.contig = f(gff.contig) except KeyError: if args.skip_missing: skipped_contigs[gff.contig] += 1 continue else: raise if genome_fasta: lcontig = genome_fasta.getLength(gff.contig) if lcontig < gff.end: outofrange_contigs[gff.contig] += 1 continue if args.contig_pattern: to_remove = [ re.compile(x) for x in args.contig_pattern.split(",") ] if any([x.search(gff.contig) for x in to_remove]): filtered_contigs[gff.contig] += 1 continue args.stdout.write(str(gff) + "\n") if skipped_contigs: E.info("skipped %i entries on %i contigs: %s" % (sum(skipped_contigs.values()), len(list(skipped_contigs.keys())), str(skipped_contigs))) if outofrange_contigs: E.warn( "skipped %i entries on %i contigs because they are out of range: %s" % (sum(outofrange_contigs.values()), len(list( outofrange_contigs.keys())), str(outofrange_contigs))) if filtered_contigs: E.info("filtered out %i entries on %i contigs: %s" % (sum(filtered_contigs.values()), len(list(filtered_contigs.keys())), str(filtered_contigs))) elif args.method == "rename-chr": if not chr_map: raise ValueError("please supply mapping file") for gff in renameChromosomes(gffs, chr_map): args.stdout.write(str(gff) + "\n") else: for gff in gffs: if args.method == "forward_coordinates": gff.invert(contigs[gff.contig]) if args.method == "forward_strand": gff.invert(contigs[gff.contig]) gff.strand = "+" if agp: # note: this works only with forward coordinates gff.contig, gff.start, gff.end = agp.mapLocation( gff.contig, gff.start, gff.end) args.stdout.write(str(gff) + "\n") E.stop()
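# Illustrative invocations for some of the methods handled above. The
# executable name and file names are placeholders; the sanitize call mirrors
# the statement built in getRepeatDataFromUCSC earlier in this section:
#
#   cat in.gff | cgat gff2gff --method=sanitize --sanitize-method=genome \
#       --skip-missing --genome-file=genome > out.gff
#
#   cat in.gff | cgat gff2gff --method=add-flank --flank-method=add \
#       --extension-upstream=1000 --extension-downstream=1000 \
#       --genome-file=genome > out.gff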
def main(argv=None): parser = E.ArgumentParser(description=__doc__) parser.add_argument( "-s", "--sample-size", dest="sample_size", type=float, help= "sample size. If less than 1, take a proportion of the chromosome size. " "If 1 or greater, take a fixed number of variants ") parser.set_defaults(input_filename_fasta=None, sample_size=0.001, sample_name="NA12878") (args) = E.start(parser, argv=argv, add_output_options=True) if len(args) > 0: args.input_filename_fasta = args[0] if args.input_filename_fasta == "-": args.input_filename_fasta = args.stdin outf = args.stdout outf.write("##fileformat=VCFv4.1\n") outf.write( "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n") outf.write( "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\t{}\n".format( args.sample_name)) with pysam.FastxFile(args.input_filename_fasta) as inf: for record in inf: contig = record.name sequence = record.sequence if args.sample_size < 1.0: nsamples = int(float(len(sequence)) * args.sample_size) else: nsamples = int(args.sample_size) E.info("generating {} sampled variants for contig {}".format( nsamples, contig)) sampled_positions = set() missing_nsamples = nsamples while len(sampled_positions) < nsamples: raw_positions = random.sample( list(range(len(sequence))), nsamples - len(sampled_positions)) filtered_positions = [ x for x in raw_positions if sequence[x] != "N" ] sampled_positions.update(filtered_positions) E.debug("sample update: total={}, raw={}, filtered={}".format( len(sampled_positions), len(raw_positions), len(filtered_positions))) sampled_positions = sorted(sampled_positions) for position in sampled_positions: base = sequence[position] outf.write("{}\t{}\t.\t{}\t{}\t.\t.\t.\tGT\t0/0\n".format( contig, position + 1, base, base)) E.stop()
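# Example of a record emitted by the loop above (contig and position are
# illustrative). Every sampled site is written as a homozygous reference
# call with one-based coordinates, ALT equal to REF and genotype 0/0:
#
#   chr1    10042   .       T       T       .       .       .       GT      0/0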
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $1.0$", usage=globals()["__doc__"]) parser.add_option("-r", "--reffile", dest="reffile", type="string", help="Supply reference gtf file name") parser.add_option("-d", "--class-file", dest="classfile", type="string", help="Supply database name") parser.add_option("-o", "--outfile", dest="outfile", type="string", help="Supply output bed file name") parser.add_option("-u", "--indivfile", dest="indivfile", type="string", help="Supply output bed file name for individual utrons") parser.add_option("-p", "--partfile", dest="partfile", type="string", help="Supply output bed file name for partnered utrons") parser.add_option( "-q", "--indivpartfile", dest="indivpartfile", type="string", help="Supply output bed file name for individual partnered utrons") parser.add_option("-n", "--novel-file", dest="novelfile", type="string", help="Supply output bed file name for novel introns") parser.add_option( "--novel-transcript", dest="novel_id", type="string", help="DEBUG: Output info for this transcript from the STDIN") parser.add_option( "--target-transcript", dest="target_id", type="string", help="DEBUG: Output info for this transcript from ref-file") # add common options (-h/--help, ...) and parse command line (options, args) = E.start(parser, argv=argv) outlines = [] individuals = [] partnered = [] individualpartnered = [] novel = [] db = pandas.read_csv(options.classfile, sep="\t") # This keeps just one entry per-transcript - why? #db = db.groupby("transcript_id").first() db = db.set_index("transcript_id") enshashtable = getGeneTable(options.reffile) for novel_transcript in GTF.transcript_iterator(GTF.iterator( options.stdin)): # Why do it on a gene by gene basis rather than transcript by transcript basis? transcript_id = novel_transcript[0].transcript_id if transcript_id == options.novel_id: output_novel = True else: output_novel = False try: geneid = db.loc[transcript_id].match_gene_id except KeyError: if output_novel: E.debug("Transcript %s not in class table" % transcript_id) continue if pandas.isnull(geneid): if output_novel: E.debug("Transcript %s matches no gene in class table" % transcript_id) continue ens_gene = enshashtable[geneid] all_ref_introns = set() novel_transcript_exons = GTF.asRanges(novel_transcript, "exon") novel_transcript_introns = GTF.toIntronIntervals(novel_transcript) for ref_transcript in ens_gene["models"].values(): ref_introns = GTF.toIntronIntervals(ref_transcript) all_ref_introns.update(ref_introns) #Identify comparison set def _in_exon(position, exons): return any(e[0] <= position <= e[1] for e in exons) # check if this ever gets the wrong start_codon. 
filtered_starts = [ s for s in ens_gene["start_codons"] if _in_exon(s, novel_transcript_exons) ] if len(filtered_starts) == 0: if output_novel: E.debug("No starts found for %s" % transcript_id) continue #if novel_transcript[0].strand == "-": # selected_start = max(filtered_starts) #else: # selected_start = min(filtered_starts) selected_models = list() for startc in filtered_starts: selected_models.extend(ens_gene["start_codons"][startc]) if output_novel: E.debug("Transcripts with compatible starts are %s" % selected_models) for ref_transcript_id in selected_models: if output_novel and ref_transcript_id == options.target_id: output_ref = True else: output_ref = False second = ens_gene["models"][ref_transcript_id] ens_CDS = GTF.asRanges(second, "CDS") if len(ens_CDS) == 0: if output_ref: E.debug("%s is not coding" ) # ensure only protein-coding transcripts continue ens_exons = GTF.asRanges(second, "exon") first_introns = set(novel_transcript_introns) second_introns = set(GTF.toIntronIntervals(second)) first_CDSintrons = [ intron for intron in first_introns if (intron[0] > ens_CDS[0][0] and intron[1] < ens_CDS[-1][1]) ] second_CDSintrons = [ intron for intron in second_introns if (intron[0] > ens_CDS[0][0] and intron[1] < ens_CDS[-1][1]) ] first_CDSintrons = set(first_CDSintrons) second_CDSintrons = set(second_CDSintrons) if not first_CDSintrons == second_CDSintrons: if output_ref: E.debug("CDS chains do not match. Chains are:") first_CDSintrons = sorted(list(first_CDSintrons)) second_CDSintrons = sorted(list(second_CDSintrons)) output = "\n".join( map(str, zip(first_CDSintrons, second_CDSintrons))) E.debug(output) continue # match CDS intron chain firstUTRintrons = first_introns - first_CDSintrons if len(firstUTRintrons) == 0: if output_ref: E.debug("No UTR introns") continue secondUTRintrons = second_introns - second_CDSintrons found = False for intron in first_introns: if (intron[0] < ens_CDS[-1][1] and intron[1] > ens_CDS[-1][1]) or \ (intron[0] < ens_CDS[0][0] and intron[1] > ens_CDS[0][0]): found = True break # ensure pruned transcript doesn't have # introns overlapping start or stop codons in ensembl # transcript if found: if output_ref: E.debug("Start or stop in intron") continue if second[0].strand == "+": ens_stop = ens_CDS[-1][1] UTR3introns = [ intron for intron in firstUTRintrons if intron[0] >= ens_CDS[-1][1] and intron[1] < ens_exons[-1][1] ] secondUTR3introns = [ intron for intron in secondUTRintrons if intron[0] >= ens_CDS[-1][1] and intron[1] < ens_exons[-1][1] ] else: ens_stop = ens_CDS[0][0] UTR3introns = [ intron for intron in firstUTRintrons if intron[1] <= ens_CDS[0][0] and intron[0] > ens_exons[0][0] ] secondUTR3introns = [ intron for intron in secondUTRintrons if intron[1] <= ens_CDS[0][0] and intron[0] > ens_exons[0][0] ] if len(UTR3introns) == 0: if output_ref: E.debug("No UTR introns") continue outbed = Bed.Bed() outbed.fields = ['.', '.', '.', '.', '.', '.', '.', '.', '.'] outbed.fromIntervals(UTR3introns) outbed.contig = novel_transcript[0].contig outbed["name"] = novel_transcript[0].transcript_id outbed["strand"] = novel_transcript[0].strand outlines.append(outbed) # get output for each transcript for item in UTR3introns: outbed2 = Bed.Bed() outbed2.fields = ['.', '.', '.', '.'] outbed2.fromIntervals([item]) outbed2.contig = novel_transcript[0].contig outbed2['name'] = novel_transcript[0].transcript_id outbed2["strand"] = novel_transcript[0].strand outbed2["thickStart"] = ens_stop individuals.append(outbed2) # get output for each intron UTR3introns = 
set(UTR3introns) secondUTR3introns = set(secondUTR3introns) extraUTR3introns = list(UTR3introns - secondUTR3introns) if output_ref and len(secondUTR3introns - UTR3introns) > 0: E.debug("Following introns in UTR of %s but not %s" % (options.target_id, options.novel_id)) E.debug(secondUTRintrons - UTR3introns) # get only introns that are not in matched transcript if len(extraUTR3introns) != 0 and len(secondUTR3introns - UTR3introns) == 0: outbed3 = Bed.Bed() outbed3.fields = ['.'] * 9 outbed3.fromIntervals(extraUTR3introns) outbed3.contig = novel_transcript[0].contig outbed3["name"] = novel_transcript[ 0].transcript_id + ":" + second[0].transcript_id outbed3["strand"] = novel_transcript[0].strand partnered.append(outbed3) for item in extraUTR3introns: outbed4 = Bed.Bed() outbed4.fields = ['.', '.', '.', '.'] outbed4.fromIntervals([item]) outbed4.contig = novel_transcript[0].contig outbed4["name"] = novel_transcript[ 0].transcript_id + ":" + second[0].transcript_id outbed4["strand"] = novel_transcript[0].strand outbed4["thickStart"] = ens_stop individualpartnered.append(outbed4) if len(all_ref_introns) == 0: ens_starts, ens_ends = [], [] else: ens_starts, ens_ends = zip(*all_ref_introns) novelEvents = [ i for i in UTR3introns if i[0] not in ens_starts and i[1] not in ens_ends ] for item in novelEvents: outbed5 = Bed.Bed() outbed5.fields = ['.'] * 4 outbed5.fromIntervals([item]) outbed5.contig = novel_transcript[0].contig outbed5["name"] = novel_transcript[ 0].transcript_id + ":" + second[0].transcript_id outbed5["strand"] = novel_transcript[0].strand outbed5["thickStart"] = ens_stop novel.append(outbed5) with IOTools.open_file(options.outfile, "w") as outf: for line in outlines: outf.write(str(line) + "\n") if options.indivfile is not None: with IOTools.open_file(options.indivfile, "w") as outf2: for line in individuals: outf2.write(str(line) + "\n") if options.partfile is not None: with IOTools.open_file(options.partfile, "w") as outf3: for line in partnered: outf3.write(str(line) + "\n") if options.indivpartfile is not None: with IOTools.open_file(options.indivpartfile, "w") as outf4: for line in individualpartnered: outf4.write(str(line) + "\n") if options.novelfile is not None: with IOTools.open_file(options.novelfile, "w") as outf5: for line in novel: outf5.write(str(line) + "\n") # write footer and output benchmark information. E.stop()
def read_liftover_chain(infile): E.debug("started reading mapping information") map_id2chromosome = [ "", ] map_chromosome2id = {} n = 0 Chain = collections.namedtuple("Chain", [ "score", "target_name", "target_size", "target_strand", "target_start", "target_end", "query_name", "query_size", "query_strand", "query_start", "query_end", "chainid" ]) def blocks(infile): keep = False for line in infile: if line.startswith("chain"): chain_data = Chain._make(line[:-1].split(" ")[1:]) if chain_data.target_strand == "-": raise NotImplementedError("target strand is negative") alignment_data = [] elif line.strip() == "": yield chain_data, alignment_data else: alignment_data.append(list(map(int, line.split(("\t"))))) map_chromosomes = collections.defaultdict(quicksect.IntervalTree) map_contig2length = collections.defaultdict(int) for chain_data, alignment_data in blocks(infile): map_contig2length[chain_data.query_name] = int(chain_data.query_size) # target maps to query # coordinates are zero-based, half-open When # the strand value is "-", position coordinates are listed in # terms of the reverse-complemented sequence x = int(chain_data.target_start) y = int(chain_data.query_start) # revert coordinates for negative strands (it seems that # the mapping file uses reverse coordinates, while liftover # output doesn't) invert = chain_data.query_strand == "-" mm = map_chromosomes[chain_data.target_name] for d in alignment_data: if len(d) == 3: size, increment_x, increment_y = d else: size, increment_x, increment_y = d[0], 0, 0 mm.add(x, x + size, (chain_data.query_name, y, y + size, invert)) x += increment_x + size y += increment_y + size if y < 0: raise ValueError( "illegal mapping in chain {}".format(chain_data)) return map_chromosomes, map_contig2length
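# For reference, a minimal (made-up but self-consistent) UCSC chain block of
# the kind parsed by blocks() above. The header line is space-separated and
# supplies the Chain namedtuple fields in order; data lines are tab-separated
# "size dt dq" triples, the last line holds only the final block size, and a
# blank line terminates the block (blocks() only yields when it reaches a
# blank line, so the file is expected to end with one):
#
#   chain 1000 chr1 1000000 + 100 160 chrA 2000000 + 500 565 1
#   20      5       10
#   35
#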
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.OptionParser(version="%prog version: $Id$", usage=globals()["__doc__"]) parser.add_option( "-e", "--exclusive-overlap", dest="exclusive", action="store_true", help="Intervals reported will be merged across the " "positive set and do not overlap any interval in any of the " "other sets [default=%default].") parser.add_option("-p", "--pattern-identifier", dest="pattern_id", type="string", help="pattern to convert a filename " "to an id [default=%default].") parser.add_option("-m", "--method", dest="method", type="choice", choices=("merged-combinations", "unmerged-combinations"), help="method to perform [default=%default]") parser.set_defaults( pattern_id="(.*).bed.gz", exclusive=False, method="merged-combinations", ) # add common options (-h/--help, ...) and parse command line (options, args) = E.start(parser, argv=argv, add_output_options=True) if len(args) < 2: raise ValueError("at least two arguments required") tags, bedfiles = [], [] for infile in args: bedfiles.append(pysam.Tabixfile(infile, "r")) tags.append(re.search(options.pattern_id, infile).groups()[0]) indices = list(range(len(bedfiles))) is_exclusive = options.exclusive if options.method == "merged-combinations": if is_exclusive: start = 1 else: start = 2 options.stdout.write("combination\twithout\tcounts\n") for ncombinants in range(start, len(bedfiles) + 1): for combination in itertools.combinations(indices, ncombinants): other = [x for x in indices if x not in combination] tag = ":".join([tags[x] for x in combination]) E.debug("combination %s started" % tag) E.debug("other: %s" % ":".join([tags[x] for x in other])) other_bed = [bedfiles[x] for x in other] outf = iotools.open_file(E.get_output_file(tag), "w", create_dir=True) c = E.Counter() for contig, start, end in combineMergedIntervals( [bedfiles[x] for x in combination]): c.found += 1 if is_exclusive and isContainedInOne( contig, start, end, other_bed): c.removed += 1 continue c.output += 1 outf.write("%s\t%i\t%i\n" % (contig, start, end)) outf.close() E.info("combination %s finished: %s" % (tag, c)) options.stdout.write("%s\t%s\t%i\n" % (":".join([tags[x] for x in combination]), ":".join([tags[x] for x in other]), c.output)) elif options.method == "unmerged-combinations": options.stdout.write("track\tcombination\twithout\tcounts\n") for foreground in indices: start = 0 background = [x for x in indices if x != foreground] for ncombinants in range(0, len(background) + 1): for combination in itertools.combinations( background, ncombinants): other = [x for x in background if x not in combination] combination_bed = [bedfiles[x] for x in combination] other_bed = [bedfiles[x] for x in other] tag = ":".join([tags[foreground]] + [tags[x] for x in combination]) E.debug("fg=%i, combination=%s, other=%s" % (foreground, combination, other)) E.debug("combination %s started" % tag) E.debug("other: %s" % ":".join([tags[x] for x in other])) outf = iotools.open_file(E.get_output_file(tag), "w", create_dir=True) c = E.Counter() for bed in combineUnmergedIntervals( bedfiles[foreground], combination_bed): c.found += 1 if is_exclusive and isContainedInOne( bed.contig, bed.start, bed.end, other_bed): c.removed += 1 continue c.output += 1 outf.write("%s\n" % str(bed)) outf.close() E.info("combination %s finished: %s" % (tag, c)) options.stdout.write( "%s\t%s\t%s\t%i\n" % (tags[foreground], ":".join([ tags[x] for x in 
combination ]), ":".join([tags[x] for x in other]), c.output)) E.stop()
def buildSpikeResults(infile, outfile): '''build matrices with results from spike-in and upload into database. The method will output several files: .spiked.gz: Number of intervals that have been spiked-in for each bin of expression and fold-change .power.gz: Global power analysis - aggregates over all ranges of fold-change and expression and outputs the power, the proportion of intervals overall that could be detected as differentially methylated. This is a table with the following columns: fdr - fdr threshold power - power level, number of intervals detectable intervals - number of intervals in observed data at given level of fdr and power. intervals_percent - percentage of intervals in observed data at given level of fdr and power The method will also upload the results into the database. Arguments --------- infile : string Input filename in :term:`tsv` format. Usually the output of :mod:`scripts/runExpression`. outfile : string Output filename in :term:`tsv` format. ''' expression_nbins = 10 fold_nbins = 10 spikefile = P.snip(infile, '.tsv.gz') + '.spike.gz' if not os.path.exists(spikefile): E.warn('no spike data: %s' % spikefile) iotools.touch_file(outfile) return ######################################## # output and load spiked results tmpfile_name = P.get_temp_filename(shared=True) statement = '''zcat %(spikefile)s | grep -e "^spike" -e "^test_id" > %(tmpfile_name)s ''' P.run(statement) E.debug("outputting spiked counts") (spiked, spiked_d2hist_counts, xedges, yedges, spiked_l10average, spiked_l2fold) = \ outputSpikeCounts( outfile=P.snip(outfile, ".power.gz") + ".spiked.gz", infile_name=tmpfile_name, expression_nbins=expression_nbins, fold_nbins=fold_nbins) ######################################## # output and load unspiked results statement = '''zcat %(infile)s | grep -v -e "^spike" > %(tmpfile_name)s ''' P.run(statement) E.debug("outputting unspiked counts") (unspiked, unspiked_d2hist_counts, unspiked_xedges, unspiked_yedges, unspiked_l10average, unspiked_l2fold) = \ outputSpikeCounts( outfile=P.snip(outfile, ".power.gz") + ".unspiked.gz", infile_name=tmpfile_name, expression_bins=xedges, fold_bins=yedges) E.debug("computing power") assert xedges.all() == unspiked_xedges.all() tmpfile = iotools.open_file(tmpfile_name, "w") tmpfile.write("\t".join(("expression", "fold", "fdr", "counts", "percent")) + "\n") fdr_thresholds = [0.01, 0.05] + list(numpy.arange(0.1, 1.0, 0.1)) power_thresholds = numpy.arange(0.1, 1.1, 0.1) spiked_total = float(spiked_d2hist_counts.sum().sum()) unspiked_total = float(unspiked_d2hist_counts.sum().sum()) outf = iotools.open_file(outfile, "w") outf.write("fdr\tpower\tintervals\tintervals_percent\n") # significant results for fdr in fdr_thresholds: take = spiked['qvalue'] < fdr # compute 2D histogram in spiked data below fdr threshold spiked_d2hist_fdr, xedges, yedges = \ numpy.histogram2d(spiked_l10average[take], spiked_l2fold[take], bins=(xedges, yedges)) # convert to percentage of spike-ins per bin spiked_d2hist_fdr_normed = spiked_d2hist_fdr / spiked_d2hist_counts spiked_d2hist_fdr_normed = numpy.nan_to_num(spiked_d2hist_fdr_normed) # set values without data to -1 spiked_d2hist_fdr_normed[spiked_d2hist_counts == 0] = -1.0 # output to table for database upload for x, y in itertools.product(list(range(len(xedges) - 1)), list(range(len(yedges) - 1))): tmpfile.write("\t".join( map(str, (xedges[x], yedges[y], fdr, spiked_d2hist_fdr[x, y], 100.0 * spiked_d2hist_fdr_normed[x, y]))) + "\n") # take elements in spiked_hist_fdr above a certain threshold for power 
in power_thresholds: # select 2D bins at a given power level power_take = spiked_d2hist_fdr_normed >= power # select the counts in the unspiked data according # to this level power_counts = unspiked_d2hist_counts[power_take] outf.write("\t".join( map(str, (fdr, power, power_counts.sum().sum(), 100.0 * power_counts.sum().sum() / unspiked_total))) + "\n") tmpfile.close() outf.close() # upload into table method = P.snip(os.path.dirname(outfile), ".dir") tablename = P.to_table( P.snip(outfile, "power.gz") + method + ".spike.load") P.load(tmpfile_name, outfile + ".log", tablename=tablename, options="--add-index=fdr") os.unlink(tmpfile_name)
def outputSpikeCounts(outfile, infile_name, expression_nbins=None, fold_nbins=None, expression_bins=None, fold_bins=None): """count significant results in bins of expression and fold change. This method groups the results of a DE analysis into a 2-dimensional histogram by tag counts/expression level and fold change. Either supply one of `nbins` or `bins` for the histograms. Arguments --------- outfile : string Output filename infile_name : string Input filename in :term:`tsv` format. Usually the output of :mod:`scripts/runExpression`. expression_nbins : int Number of bins to use for tag count histogram. fold_nbins : int Number of bins to use for fold-change histogram. expression_bins : list List of bins to use for tag count histogram. fold_bins : list List of bins to use for fold-change histogram. """ df = pandas.read_csv(infile_name, sep="\t", index_col=0) E.debug("read %i rows and %i columns of data" % df.shape) if "edger" in outfile.lower(): # edger: treatment_mean and control_mean do not exist # use supplied values directly. l10average = numpy.log(df['treatment_mean']) l2fold = numpy.log2(df['fold']) else: # use pseudocounts to compute fold changes treatment_mean = df['treatment_mean'] + 1 control_mean = df['control_mean'] + 1 # build average expression values on a log scale l10average = numpy.log((treatment_mean + control_mean) / 2) l2fold = numpy.log2(treatment_mean / control_mean) if expression_nbins is not None: mm = math.ceil(max(l10average)) expression_bins = numpy.arange(0, mm, mm / expression_nbins) if fold_nbins is not None: mm = math.ceil(max(abs(min(l2fold)), abs(max(l2fold)))) # ensure that range is centered on exact 0 n = math.ceil(fold_nbins / 2.0) fold_bins = numpy.concatenate( (-numpy.arange(0, mm, mm / n)[:0:-1], numpy.arange(0, mm, mm / n))) # compute expression bins d2hist_counts, xedges, yedges = numpy.histogram2d(l10average, l2fold, bins=(expression_bins, fold_bins)) dd = pandas.DataFrame(d2hist_counts) dd.index = list(xedges[:-1]) dd.columns = list(yedges[:-1]) dd.to_csv(iotools.open_file(outfile, "w"), sep="\t") return df, d2hist_counts, xedges, yedges, l10average, l2fold
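# Small worked example of the symmetric fold-change binning above, assuming
# fold_nbins = 4 and a maximum absolute log2 fold change that rounds up to
# mm = 4: n = ceil(4 / 2.0) = 2, numpy.arange(0, 4, 2) gives [0, 2], so
# fold_bins becomes [-2, 0, 2], i.e. bin edges centred on zero. Expression
# bins are simply numpy.arange(0, mm, mm / expression_nbins) over the log
# average values.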
def main(argv=None): """script main. """ if not argv: argv = sys.argv # setup command line parser parser = E.ArgumentParser(description=__doc__) parser.add_argument("--version", action='version', version="1.0") parser.add_argument("-o", "--output-format", dest="output_format", type=str, choices=("bedgraph", "wiggle", "bigbed", "bigwig", "bed"), help="output format [default=%default]") parser.add_argument("-s", "--shift-size", dest="shift", type=int, help="shift reads by a certain amount (ChIP-Seq) ") parser.add_argument("-e", "--extend", dest="extend", type=int, help="extend reads by a certain amount " "(ChIP-Seq) ") parser.add_argument("-p", "--wiggle-span", dest="span", type=int, help="span of a window in wiggle tracks ") parser.add_argument("-m", "--merge-pairs", dest="merge_pairs", action="store_true", help="merge paired-ended reads into a single " "bed interval [default=%default].") parser.add_argument("--scale-base", dest="scale_base", type=float, help="number of reads/pairs to scale bigwig file to. " "The default is to scale to 1M reads ") parser.add_argument( "--scale-method", dest="scale_method", type=str, choices=( "none", "reads", ), help="scale bigwig output. 'reads' will normalize by " "the total number reads in the bam file that are used " "to construct the bigwig file. If --merge-pairs is used " "the number of pairs output will be used for " "normalization. 'none' will not scale the bigwig file") parser.add_argument("--max-insert-size", dest="max_insert_size", type=int, help="only merge if insert size less that " "# bases. 0 turns of this filter ") parser.add_argument("--min-insert-size", dest="min_insert_size", type=int, help="only merge paired-end reads if they are " "at least # bases apart. " "0 turns of this filter.") parser.set_defaults( samfile=None, output_format="wiggle", shift=0, extend=0, span=1, merge_pairs=None, min_insert_size=0, max_insert_size=0, scale_method='none', scale_base=1000000, ) # add common options (-h/--help, ...) 
and parse command line (args, unknown) = E.start(parser, argv=argv, add_output_options=True, unknowns=True) if len(unknown) >= 1: args.samfile = unknown[0] if len(unknown) == 2: args.output_filename_pattern = unknown[1] if not args.samfile: raise ValueError("please provide a bam file") # Read BAM file using Pysam samfile = pysam.AlignmentFile(args.samfile, "rb") # Create temporary files / folders tmpdir = tempfile.mkdtemp() E.debug("temporary files are in %s" % tmpdir) tmpfile_wig = os.path.join(tmpdir, "wig") tmpfile_sizes = os.path.join(tmpdir, "sizes") # Create dictionary of contig sizes contig_sizes = dict(list(zip(samfile.references, samfile.lengths))) # write contig sizes outfile_size = iotools.open_file(tmpfile_sizes, "w") for contig, size in sorted(contig_sizes.items()): outfile_size.write("%s\t%s\n" % (contig, size)) outfile_size.close() # Shift and extend only available for bigwig format if args.shift or args.extend: if args.output_format != "bigwig": raise ValueError( "shift and extend only available for bigwig output") # Output filename required for bigwig / bigbed computation if args.output_format == "bigwig": if not args.output_filename_pattern: raise ValueError( "please specify an output file for bigwig computation.") # Define executable to use for binary conversion if args.output_format == "bigwig": executable_name = "wigToBigWig" else: raise ValueError("unknown output format `%s`" % args.output_format) # check required executable file is in the path executable = iotools.which(executable_name) if not executable: raise OSError("could not find %s in path." % executable_name) # Open outout file outfile = iotools.open_file(tmpfile_wig, "w") E.info("starting output to %s" % tmpfile_wig) else: outfile = iotools.open_file(tmpfile_wig, "w") E.info("starting output to stdout") # Set up output write functions if args.output_format in ("wiggle", "bigwig"): # wiggle is one-based, so add 1, also step-size is 1, so need # to output all bases if args.span == 1: outf = lambda outfile, contig, start, end, val: \ outfile.write( "".join(["%i\t%i\n" % (x, val) for x in range(start + 1, end + 1)])) else: outf = SpanWriter(args.span) elif args.output_format == "bedgraph": # bed is 0-based, open-closed outf = lambda outfile, contig, start, end, val: \ outfile.write("%s\t%i\t%i\t%i\n" % (contig, start, end, val)) # initialise counters ninput, nskipped, ncontigs = 0, 0, 0 # set output file name output_filename_pattern = args.output_filename_pattern if output_filename_pattern: output_filename = os.path.abspath(output_filename_pattern) # shift and extend or merge pairs. Output temporay bed file if args.shift > 0 or args.extend > 0 or args.merge_pairs: # Workflow 1: convert to bed intervals and use bedtools # genomecov to build a coverage file. 
# Convert to bigwig with UCSC tools bedGraph2BigWig if args.merge_pairs: # merge pairs using bam2bed E.info("merging pairs to temporary file") counter = merge_pairs(samfile, outfile, min_insert_size=args.min_insert_size, max_insert_size=args.max_insert_size, bed_format=3) E.info("merging results: {}".format(counter)) if counter.output == 0: raise ValueError("no pairs output after merging") else: # create bed file with shifted/extended tags shift, extend = args.shift, args.extend shift_extend = shift + extend counter = E.Counter() for contig in samfile.references: E.debug("output for %s" % contig) lcontig = contig_sizes[contig] for read in samfile.fetch(contig): pos = read.pos if read.is_reverse: start = max(0, read.pos + read.alen - shift_extend) else: start = max(0, read.pos + shift) # intervals extending beyond contig are removed if start >= lcontig: continue end = min(lcontig, start + extend) outfile.write("%s\t%i\t%i\n" % (contig, start, end)) counter.output += 1 outfile.close() if args.scale_method == "reads": scale_factor = float(args.scale_base) / counter.output E.info("scaling: method=%s scale_quantity=%i scale_factor=%f" % (args.scale_method, counter.output, scale_factor)) scale = "-scale %f" % scale_factor else: scale = "" # Convert bed file to coverage file (bedgraph) tmpfile_bed = os.path.join(tmpdir, "bed") E.info("computing coverage") # calculate coverage - format is bedgraph statement = """bedtools genomecov -bg -i %(tmpfile_wig)s %(scale)s -g %(tmpfile_sizes)s > %(tmpfile_bed)s""" % locals() E.run(statement) # Convert bedgraph to bigwig E.info("converting to bigwig") tmpfile_sorted = os.path.join(tmpdir, "sorted") statement = ("sort -k 1,1 -k2,2n %(tmpfile_bed)s > %(tmpfile_sorted)s;" "bedGraphToBigWig %(tmpfile_sorted)s %(tmpfile_sizes)s " "%(output_filename_pattern)s" % locals()) E.run(statement) else: # Workflow 2: use pysam column iterator to build a # wig file. Then convert to bigwig of bedgraph file # with UCSC tools. def column_iter(iterator): start = None end = 0 n = None for t in iterator: if t.pos - end > 1 or n != t.n: if start is not None: yield start, end, n start = t.pos end = t.pos n = t.n end = t.pos yield start, end, n if args.scale_method != "none": raise NotImplementedError( "scaling not implemented for pileup method") # Bedgraph track definition if args.output_format == "bedgraph": outfile.write("track type=bedGraph\n") for contig in samfile.references: # if contig != "chrX": continue E.debug("output for %s" % contig) lcontig = contig_sizes[contig] # Write wiggle header if args.output_format in ("wiggle", "bigwig"): outfile.write("variableStep chrom=%s span=%i\n" % (contig, args.span)) # Generate pileup per contig using pysam and iterate over columns for start, end, val in column_iter(samfile.pileup(contig)): # patch: there was a problem with bam files and reads # overextending at the end. These are usually Ns, but # need to check as otherwise wigToBigWig fails. 
                if lcontig <= end:
                    E.warn("read extending beyond contig: %s: %i > %i" %
                           (contig, end, lcontig))
                    end = lcontig
                if start >= end:
                    continue
                if val > 0:
                    outf(outfile, contig, start, end, val)
            ncontigs += 1

        # close the output file; SpanWriter buffers values and needs an
        # explicit flush
        if isinstance(outf, SpanWriter):
            outf.flush(outfile)
        else:
            outfile.flush()

        E.info("finished output")

        # report counters
        E.info("ninput=%i, ncontigs=%i, nskipped=%i" %
               (ninput, ncontigs, nskipped))

        # convert to binary formats
        if args.output_format == "bigwig":
            outfile.close()

            E.info("starting %s conversion" % executable)
            try:
                retcode = subprocess.call(
                    " ".join((executable, tmpfile_wig, tmpfile_sizes,
                              output_filename_pattern)),
                    shell=True)
                if retcode != 0:
                    E.warn("%s terminated with signal: %i" %
                           (executable, -retcode))
                    return -retcode
            except OSError as msg:
                E.warn("error while running bigwig conversion: %s" % msg)
                return 1

            E.info("finished bigwig conversion")
        else:
            with open(tmpfile_wig) as inf:
                sys.stdout.write(inf.read())

    # clean up temporary files
    shutil.rmtree(tmpdir)

    E.stop()
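# Illustrative sketch (not part of the script): the pileup-column collapsing
# used in workflow 2 above, applied to plain (position, depth) tuples instead
# of pysam pileup columns. The positions and depths are made up.
def _example_collapse_columns():
    import collections

    Column = collections.namedtuple("Column", "pos n")

    def column_iter(iterator):
        # merge adjacent positions that have identical depth into intervals
        start, end, n = None, 0, None
        for t in iterator:
            if t.pos - end > 1 or n != t.n:
                if start is not None:
                    yield start, end, n
                start = t.pos
                n = t.n
            end = t.pos
        if start is not None:
            yield start, end, n

    columns = [Column(10, 2), Column(11, 2), Column(12, 3), Column(20, 1)]
    # yields (10, 11, 2), (12, 12, 3), (20, 20, 1)
    return list(column_iter(columns))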
def compareCheckSums(infiles, outfile): '''compare checksum files against existing reference data. ''' outf = iotools.open_file(outfile, "w") outf.write("\t".join(( ("track", "status", "job_finished", "nfiles", "nref", "missing", "extra", "different", "different_md5", "different_lines", "same", "same_md5", "same_lines", "same_exist", "files_missing", "files_extra", "files_different_md5", "files_different_lines"))) + "\n") for infile in infiles: E.info("working on {}".format(infile)) track = P.snip(infile, ".stats") logfiles = glob.glob(track + "*.log") job_finished = True for logfile in logfiles: is_complete = iotools.is_complete(logfile) E.debug("logcheck: {} = {}".format(logfile, is_complete)) job_finished = job_finished and is_complete reffile = track + ".ref" # regular expression of files to test only for existence regex_exist = PARAMS.get('%s_regex_exist' % track, None) if regex_exist: regex_exist = re.compile("|".join(P.as_list(regex_exist))) regex_linecount = PARAMS.get('%s_regex_linecount' % track, None) if regex_linecount: regex_linecount = re.compile("|".join(P.as_list(regex_linecount))) regex_md5 = PARAMS.get('%s_regex_md5' % track, None) if regex_md5: regex_md5 = re.compile("|".join(P.as_list(regex_md5))) if not os.path.exists(reffile): raise ValueError('no reference data defined for %s' % track) cmp_data = pandas.read_csv(iotools.open_file(infile), sep="\t", index_col=0) ref_data = pandas.read_csv(iotools.open_file(reffile), sep="\t", index_col=0) shared_files = set(cmp_data.index).intersection(ref_data.index) missing = set(ref_data.index).difference(cmp_data.index) extra = set(cmp_data.index).difference(ref_data.index) different = set(shared_files) # remove those for which only check for existence if regex_exist: same_exist = set([x for x in different if regex_exist.search(x)]) different = set( [x for x in different if not regex_exist.search(x)]) else: same_exist = set() # select those for which only check for number of lines if regex_linecount: check_lines = [x for x in different if regex_linecount.search(x)] dd = (cmp_data['nlines'][check_lines] != ref_data['nlines'][check_lines]) different_lines = set(dd.index[dd]) different = different.difference(check_lines) dd = (cmp_data['nlines'][check_lines] == ref_data['nlines'] [check_lines]) same_lines = set(dd.index[dd]) else: different_lines = set() same_lines = set() # remainder - check md5 if regex_md5: check_md5 = [x for x in different if regex_md5.search(x)] dd = (cmp_data['md5'][check_md5] != ref_data['md5'][check_md5]) different_md5 = set(dd.index[dd]) dd = (cmp_data['md5'][check_md5] == ref_data['md5'][check_md5]) same_md5 = set(dd.index[dd]) else: different_md5 = set() same_md5 = set() if job_finished and (len(missing) + len(extra) + len(different_md5) + len(different_lines) == 0): status = "OK" else: status = "FAIL" outf.write("\t".join( map(str, ( track, status, job_finished, len(cmp_data), len(ref_data), len(missing), len(extra), len(different_md5) + len(different_lines), len(different_md5), len(different_lines), len(same_md5) + len(same_lines) + len(same_exist), len(same_md5), len(same_lines), len(same_exist), ",".join(missing), ",".join(extra), ",".join(different_md5), ",".join(different_lines), ))) + "\n") outf.close()
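# Illustrative sketch (not part of the pipeline): how files shared between a
# result table and a reference table can be partitioned by per-track regular
# expressions into "existence only", "line count" and "md5" comparisons, as
# done above. The file names and patterns are hypothetical.
def _example_partition_checks():
    import re

    shared = {"plots/summary.png", "counts.tsv", "result.log"}
    regex_exist = re.compile(r"\.png$|\.log$")
    regex_linecount = re.compile(r"counts")

    # files matched by regex_exist only need to be present
    exist_only = {x for x in shared if regex_exist.search(x)}
    remainder = shared - exist_only
    # files matched by regex_linecount are compared by number of lines
    by_linecount = {x for x in remainder if regex_linecount.search(x)}
    # everything else is compared by md5 checksum
    by_md5 = remainder - by_linecount
    return exist_only, by_linecount, by_md5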
def runDRMAA(data, environment): '''run jobs in data using drmaa to connect to the cluster.''' # SNS: Error dection now taken care of with Cluster.py # expandStatement function # working directory - needs to be the one from which the # the script is called to resolve input files. cwd = os.getcwd() session = drmaa.Session() session.initialize() jobids = [] kwargs = {} for filename, cmd, options, tmpdir, subdirs in data: from_stdin, to_stdout = True, True if subdirs: outdir = "%s.dir/" % (filename) os.mkdir(outdir) cmd = re.sub("%DIR%", outdir, cmd) x = re.search("'--log=(\S+)'", cmd) or re.search("'--L\s+(\S+)'", cmd) if x: logfile = filename + ".log" cmd = cmd[:x.start()] + "--log=%s" % logfile + cmd[x.end():] else: logfile = filename + ".out" if "%STDIN%" in cmd: cmd = re.sub("%STDIN%", filename, cmd) from_stdin = False if "%STDOUT%" in cmd: cmd = re.sub("%STDOUT%", filename + ".out", cmd) to_stdout = False cmd = " ".join(re.sub("\t+", " ", cmd).split("\n")) E.info("running statement:\n%s" % cmd) job_script = tempfile.NamedTemporaryFile(dir=os.getcwd(), delete=False, mode="w+t") job_script.write("#!/bin/bash\n") # -l -O expand_aliases\n" ) job_script.write(Cluster.expandStatement(cmd) + "\n") job_script.close() job_path = os.path.abspath(job_script.name) os.chmod(job_path, stat.S_IRWXG | stat.S_IRWXU) # get session for process - only one is permitted job_name = os.path.basename(kwargs.get("outfile", "farm.py")) options_dict = vars(options) options_dict["workingdir"] = os.getcwd() if options.job_memory: job_memory = options.job_memory elif options.cluster_memory_default: job_memory = options.cluster_memory_default else: job_memory = "2G" jt = Cluster.setupDrmaaJobTemplate(session, options_dict, job_name, job_memory) jt.remoteCommand = job_path # update the environment e = {'BASH_ENV': options.bashrc} if environment: for en in environment: try: e[en] = os.environ[en] except KeyError: raise KeyError( "could not export environment variable '%s'" % en) jt.jobEnvironment = e # SNS: Native specifation setting abstracted # to Pipeline/Cluster.setupDrmaaJobTemplate() # use stdin for data if from_stdin: jt.inputPath = ":" + filename # set paths. # later: allow redirection of stdout and stderr to files # could this even be across hosts? if to_stdout: jt.outputPath = ":" + filename + ".out" else: jt.outputPath = ":" + filename + ".stdout" jt.errorPath = ":" + filename + ".err" jobid = session.runJob(jt) jobids.append((jobid, job_path, filename, cmd, logfile)) E.debug("%i jobs have been submitted" % len(jobids)) results = [] for jobid, job_path, filename, cmd, logfile in jobids: try: retval = session.wait(jobid, drmaa.Session.TIMEOUT_WAIT_FOREVER) except Exception as msg: # ignore message 24 in PBS # code 24: drmaa: Job finished but resource usage information # and/or termination status could not be provided.": if not msg.message.startswith("code 24"): raise retval = None if retval and retval.exitStatus != 0: raise OSError("Child was terminated by signal %i: \n%s\n" % (retval.exitStatus, cmd)) results.append((retval, filename, cmd, logfile, 1)) os.unlink(job_path) session.deleteJobTemplate(jt) session.exit()
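# Illustrative sketch (not part of farm.py): the bare drmaa submit/wait cycle
# that the function above wraps. Assumes the python `drmaa` bindings and a
# configured DRMAA-capable cluster; the command and arguments are hypothetical.
def _example_drmaa_submit():
    import drmaa

    session = drmaa.Session()
    session.initialize()
    try:
        jt = session.createJobTemplate()
        jt.remoteCommand = "/bin/sleep"
        jt.args = ["10"]
        jobid = session.runJob(jt)
        # block until the job has finished and collect its exit status
        retval = session.wait(jobid, drmaa.Session.TIMEOUT_WAIT_FOREVER)
        session.deleteJobTemplate(jt)
        return retval.exitStatus
    finally:
        session.exit()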
def annotateGenome(iterator, fasta, options): """perform a full segmentation of the genome (UTR, exon, intron ...) """ ninput, noutput, nadded, nambiguous, nframeshifts, nunknown = 0, 0, 0, 0, 0, 0 last = None is_ambiguous = False for this in iterator: ninput += 1 E.debug("last=%s" % str(last)) E.debug("this=%s" % str(this)) E.debug("is_ambiguous=%s" % str(is_ambiguous)) if last and last.contig == this.contig: # check if file is sorted correctly assert last.start <= this.start, "input file needs to be sorted by contig, start" if last.end <= this.start: if not is_ambiguous: if last.gene_id != this.gene_id: nadded += addIntergenicSegment(last, this, fasta, options) else: d = this.start - last.end if d >= options.min_intron_length: nadded += addSegment("intronic", last.end, this.start, last, options) elif d <= options.max_frameshift_length: nframeshifts += addSegment("frameshift", last.end, this.start, last, options) else: nunknown += addSegment("unknown", last.end, this.start, last, options) else: if last.feature == this.feature and \ last.gene_id == this.gene_id: nambiguous += addSegment(last.feature, last.end, this.start, last, options) else: nambiguous += addSegment("ambiguous", last.end, this.start, last, options) is_ambiguous = False last = this elif last.end > this.start: if last.gene_id != this.gene_id: # flag next region as ambiguous is_ambiguous = True last.end = this.end else: nadded += addIntergenicSegment(last, None, fasta, options) nadded += addIntergenicSegment(None, this, fasta, options) last = this options.stdout.write("%s\n" % str(this)) noutput += 1 E.info( "ninput=%i, noutput=%i, nadded=%i, nambiguous=%i, nframeshifts=%i, nunknown=%i" % (ninput, noutput, nadded, nambiguous, nframeshifts, nunknown))
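# Illustrative sketch (not part of the script): classifying the gap between
# two consecutive segments of the same gene by length, as done above. The
# default thresholds are hypothetical stand-ins for the command line options.
def _example_classify_gap(gap_length,
                          min_intron_length=30,
                          max_frameshift_length=10):
    if gap_length >= min_intron_length:
        return "intronic"
    elif gap_length <= max_frameshift_length:
        return "frameshift"
    return "unknown"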
def main(argv=None): parser = getOptionParser() (options, args) = E.Start(parser, add_cluster_options=True) if len(args) == 0: raise ValueError( "command line argument missing - see usage information") options.renumber_column = [x.split(":") for x in options.renumber_column] cmd = args[0] if len(args) > 1: cmd += " '" + "' '".join(args[1:]) + "'" if options.dry_run: cmd = re.sub("%DIR%", "", cmd) retcode = subprocess.call(cmd, shell=True, stdin=sys.stdin, stdout=sys.stdout, cwd=os.getcwd(), close_fds=True) E.Stop() sys.exit(0) failed_requests = [] started_requests = [] niterations = 0 if not options.collect: tmpdir = os.path.abspath(tempfile.mkdtemp(dir=options.tmpdir)) E.info(" working in directory %s" % tmpdir) if options.split_at_lines: chunk_iterator = chunk_iterator_lines args = (options.split_at_lines, ) elif options.split_at_column: chunk_iterator = chunk_iterator_column args = (options.split_at_column - 1, options.max_files) elif options.split_at_regex: chunk_iterator = chunk_iterator_regex_split args = (re.compile(options.split_at_regex), 0, options.chunksize, options.max_lines) elif options.group_by_regex: chunk_iterator = chunk_iterator_regex_group args = (re.compile(options.group_by_regex), 0, options.chunksize) else: raise ValueError("please specify a way to chunk input data") data = [(x, cmd, options, None, options.subdirs) for x in chunk_iterator(options.stdin, args, prefix=tmpdir, use_header=options.input_header)] started_requests = [(x[0], x[0] + ".out") for x in data] if len(data) == 0: E.warn("no data received") E.Stop() sys.exit(0) if options.method == "multiprocessing": pool = Pool(options.cluster_num_jobs) results = pool.map(runCommand, data, chunksize=1) elif options.method == "drmaa": results = [] runDRMAA(data, environment=options.environment) elif options.method == "threads": pool = ThreadPool(options.cluster_num_jobs) results = pool.map(runCommand, data, chunksize=1) niterations = 0 for retcode, filename, cmd, logfile, iterations in results: niterations += iterations if not hasFinished(retcode, filename, options.output_tag, logfile): failed_requests.append((filename, cmd)) else: tmpdir = options.collect started_requests = [(x[:-4], x) for x in glob.glob(tmpdir + "/*.out")] E.info("collecting %i files from %s" % (len(started_requests), tmpdir)) if failed_requests: for fn, cmd in failed_requests: E.error("failed request: filename= %s, cmd= %s" % (fn, cmd)) else: E.info("building result from %i parts" % len(started_requests)) if options.renumber: mapper = MapperLocal(pattern=options.renumber) else: mapper = MapperEmpty() # deal with stdout name = None index = None for pattern, column in options.renumber_column: if re.search(pattern, "stdout"): try: index = int(column) - 1 except ValueError: name = column break if options.binary: ResultBuilderBinary()(started_requests, options.stdout, options) else: regex = None if options.output_regex_header: regex = re.compile(options.output_regex_header) ResultBuilder(mapper=mapper, field_index=index, field_name=name, header_regex=regex)(started_requests, options.stdout, options) # deal with logfiles : combine them into a single file rr = re.search("'--log=(\S+)'", cmd) or re.search("'--L\s+(\S+)'", cmd) if rr: E.info("logging output goes to %s" % rr.groups()[0]) logfile = iotools.openFile(rr.groups()[0], "a") ResultBuilderLog()([(x[0], "%s.log" % x[0]) for x in started_requests], logfile, options) logfile.close() # deal with other files if options.subdirs: files = glob.glob("%s/*.dir/*" % tmpdir) # remove directory filenames = 
set([os.path.basename(x) for x in files]) xx = len(".out") for filename in filenames: _, filetype = os.path.splitext(filename) name = None index = None for pattern, column in options.renumber_column: if re.search(pattern, filename): try: index = int(column) - 1 except ValueError: name = column break if options.binary: builder = ResultBuilderBinary(mapper=mapper) elif filetype in (".fa", ".fasta"): builder = ResultBuilderFasta(mapper=mapper) elif filetype in (".mali", ): builder = ResultBuilderFasta(mapper=MapperEmpty()) elif filetype in (".psl"): builder = ResultBuilderPSL(mapper=mapper) elif filetype in (".gtf", ".gff"): builder = ResultBuilderGFF(mapper=mapper, field_index=index, field_name=name) elif filetype in (".png"): builder = ResultBuilderCopies(mapper=mapper) else: builder = ResultBuilder(mapper=mapper, field_index=index, field_name=name) E.debug("chose the following builder for %s: %s: %s" % (filename, filetype, str(builder))) E.info("collecting results for %s" % filename) input_filenames = [] for fi, fn in started_requests: fn = fn[:-xx] + ".dir/" + filename if os.path.exists(fn): input_filenames.append((fi, fn)) E.info("output of %i files goes to %s" % (len(filenames), filename)) outfile = iotools.openFile(options.output_pattern % filename, "w") builder(input_filenames, outfile, options) outfile.close() if not options.debug and (not options.resume or not options.collect): if len(failed_requests) == 0: E.info("removing directory %s" % tmpdir) shutil.rmtree(tmpdir) else: E.info("directory %s not removed due to %i failed jobs" % (tmpdir, len(failed_requests))) E.info("job control: nstarted=%i, nfinished=%i, nerrors=%i, nrepeats=%i" % (len(started_requests), len(started_requests) - len(failed_requests), len(failed_requests), niterations)) E.Stop()
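# Illustrative sketch (not part of farm.py): the split/run/collect idea above
# in its simplest form - split the input into fixed-size line chunks, run a
# shell command on each chunk and concatenate the outputs. The file layout,
# chunk naming and command are hypothetical.
def _example_split_run_collect(infile, chunksize, cmd, outfile):
    import glob
    import os
    import subprocess
    import tempfile

    tmpdir = tempfile.mkdtemp()

    # split the input into chunks of `chunksize` lines
    with open(infile) as inf:
        lines = inf.readlines()
    chunks = []
    for n, start in enumerate(range(0, len(lines), chunksize)):
        fn = os.path.join(tmpdir, "chunk_%05i" % n)
        with open(fn, "w") as outf:
            outf.writelines(lines[start:start + chunksize])
        chunks.append(fn)

    # run the command on each chunk, writing one .out file per chunk
    for fn in chunks:
        with open(fn) as inf, open(fn + ".out", "w") as outf:
            subprocess.check_call(cmd, shell=True, stdin=inf, stdout=outf)

    # collect the per-chunk outputs in order
    with open(outfile, "w") as outf:
        for fn in sorted(glob.glob(os.path.join(tmpdir, "chunk_*.out"))):
            with open(fn) as inf:
                outf.write(inf.read())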
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(version="%prog version: $Id$",
                            usage=globals()["__doc__"])

    parser.add_option("-m", "--method", dest="method", type="choice",
                      choices=('join', ),
                      help="method to apply [default=%default].")

    parser.set_defaults(
        method="join",
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if len(args) != 2:
        raise ValueError(
            "please supply two fastq files on the commandline")

    fn1, fn2 = args
    c = E.Counter()
    outfile = options.stdout

    if options.method == "join":
        # merge based on diagonals in dotplot
        iter1 = Fastq.iterate(iotools.open_file(fn1))
        iter2 = Fastq.iterate(iotools.open_file(fn2))
        tuple_size = 2
        for left, right in zip(iter1, iter2):
            c.input += 1

            # build dictionary of tuples
            s1, q1 = left.seq, left.quals

            d = collections.defaultdict(list)
            for x in range(len(s1) - tuple_size):
                d[s1[x:x + tuple_size]].append(x)

            s2, q2 = right.seq, right.quals
            # use the reverse complement of the second read
            s2 = Genomics.reverse_complement(s2)
            q2 = q2[::-1]

            # compute list of offsets/diagonals
            offsets = collections.defaultdict(int)
            for x in range(len(s2) - tuple_size):
                kmer = s2[x:x + tuple_size]
                for y in d[kmer]:
                    offsets[x - y] += 1

            # find maximum diagonal
            sorted_offsets = sorted(
                [(y, x) for x, y in list(offsets.items())])
            max_count, max_offset = sorted_offsets[-1]

            E.debug('%s: maximum offset at %i' % (left.identifier,
                                                  max_offset))

            # simple merge sequence
            take = len(s2) - max_offset
            merged_seq = s1 + s2[take:]

            # simple merge quality scores
            merged_quals = q1 + q2[take:]

            new_entry = copy.copy(left)
            new_entry.seq = merged_seq
            new_entry.quals = merged_quals
            outfile.write(str(new_entry) + "\n")
            c.output += 1

    # write footer and output benchmark information.
    E.info("%s" % str(c))
    E.stop()
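# Illustrative sketch (not part of the script): finding the best overlap
# offset between a read and the reverse complement of its mate by counting
# shared k-mer diagonals, as in the "join" method above. The sequences and
# the function name are made up.
def _example_best_overlap_offset(s1, s2_revcomp, k=2):
    import collections

    # positions of every k-mer in the first sequence
    kmer_positions = collections.defaultdict(list)
    for x in range(len(s1) - k):
        kmer_positions[s1[x:x + k]].append(x)

    # vote for offsets (diagonals) supported by shared k-mers
    offsets = collections.defaultdict(int)
    for x in range(len(s2_revcomp) - k):
        for y in kmer_positions[s2_revcomp[x:x + k]]:
            offsets[x - y] += 1

    if not offsets:
        return None, 0

    counts_and_offsets = sorted(
        (count, offset) for offset, count in offsets.items())
    max_count, max_offset = counts_and_offsets[-1]
    return max_offset, max_count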
def loadTagDataPandas(tags_filename, design_filename): '''load tag data for deseq/edger analysis. *Infile* is a tab-separated file with counts. *design_file* is a tab-separated file with the experimental design with four columns:: track include group pair CW-CD14-R1 0 CD14 1 CW-CD14-R2 0 CD14 1 CW-CD14-R3 1 CD14 1 CW-CD4-R1 1 CD4 1 FM-CD14-R1 1 CD14 2 FM-CD4-R2 0 CD4 2 FM-CD4-R3 0 CD4 2 FM-CD4-R4 0 CD4 2 track name of track - should correspond to column header in *infile* include flag to indicate whether or not to include this data group group indicator - experimental group pair pair that sample belongs to (for paired tests) This method creates various R objects: countsTable : data frame with counts. groups : vector with groups pairs : vector with pairs ''' counts_table = pd.read_table(tags_filename, sep="\t", index_col=0, comment="#") E.info("read data: %i observations for %i samples" % counts_table.shape) E.debug("sample names: %s" % list(counts_table.columns)) inf = iotools.open_file(design_filename) design_table = pd.read_csv(inf, sep="\t", index_col=0) inf.close() E.debug("design names: %s" % list(design_table.index)) missing = set(counts_table.columns).difference(design_table.index) if missing: E.warn("missing samples from design file are ignored: %s" % missing) # remove unnecessary samples design_table = design_table[design_table["include"] != 0] E.debug("included samples: %s" % list(design_table.index)) counts_table = counts_table[list(design_table.index)] E.info("filtered data: %i observations for %i samples" % counts_table.shape) return counts_table, design_table
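# Illustrative sketch (not part of the pipeline): the filtering performed by
# loadTagDataPandas on small in-memory tables - drop samples flagged with
# include == 0 in the design, then subset the count columns. Sample names
# and counts are made up.
def _example_filter_counts_by_design():
    import pandas as pd

    counts_table = pd.DataFrame(
        {"sampleA": [10, 0, 5], "sampleB": [3, 2, 1], "sampleC": [7, 8, 9]},
        index=["gene1", "gene2", "gene3"])
    design_table = pd.DataFrame(
        {"include": [1, 0, 1],
         "group": ["CD14", "CD14", "CD4"],
         "pair": [1, 1, 2]},
        index=["sampleA", "sampleB", "sampleC"])

    # remove excluded samples, then keep only the corresponding count columns
    design_table = design_table[design_table["include"] != 0]
    counts_table = counts_table[list(design_table.index)]
    return counts_table, design_table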
def main(argv): options = P.initialize(argv, config_file="benchmark.yml") # compatibility with cgatcore < 0.6.3 if isinstance(options, tuple): options = options[0] # not sure what this does # if not options.config_file: # P.get_parameters(options.config_file) # else: # sys.exit(P.main(options, args)) params = P.get_params() with arvados_enabled(always_mount=options.always_mount): mountpoint = params.get("mount_point", None) if mountpoint: redirect_defaults2mountpoint(mountpoint) # A selection of command line arguments are added to PARAMS # as 'extras' not implemented in ruffus 2.6.3 kwargs = collections.defaultdict(dict) if options.only_info: kwargs["extras"].update({'only_info': True}) P.PARAMS["only_info"] = True if options.is_test: kwargs["extras"].update({'is_test': True}) P.PARAMS["is_test"] = True E.debug("construction of workflow started") pipeline = ruffus.Pipeline('benchmark') # Tool execution suffix, tool_runners = add_tools_to_pipeline(pipeline, map_tool_to_runner, config=P.PARAMS, **kwargs) E.debug("added {} tools to workflow".format(len(tool_runners))) # Optionally, add externally computed files as # pseudo-tools: if "external" in P.PARAMS["setup"]: external_runners = add_external_data_to_pipeline(pipeline, config=P.PARAMS, **kwargs) tool_runners.extend(external_runners) # Optionally, combine tool runs into aggregate # outputs. The type of the output is preserved # (VCF -> VCF, etc.) # For example, call individual members in a trio # and then build a combined VCF to analyse mendelian # inconsistencies. if "collate" in P.PARAMS["setup"]: collate_runners = add_collations_to_pipeline( pipeline, map_collate_to_runner, P.PARAMS["setup"]["collate"], tasks=tool_runners, config=P.PARAMS) if P.PARAMS["setup"].get("only_collate", False): tool_runners = [] if P.PARAMS["setup"].get("no_collate_metrics", False): collate_runners = [] E.debug("added {} collators to workflow".format( len(collate_runners))) else: collate_runners = [] # Optionally, split up the output before applying # additional analyses. The type of the output is preserved # (VCF -> VCF, etc). # For example, identify false positives, false negatives # and true positives and collect metrics individually. 
if "split" in P.PARAMS["setup"]: split_runners = add_splits_to_pipeline(pipeline, map_split_to_runner, tool_runners, P.PARAMS["setup"]["split"], tasks=tool_runners, config=P.PARAMS) if P.PARAMS["setup"].get("only_split", False): tool_runners = [] E.debug("added {} splitters to workflow".format( len(split_runners))) else: split_runners = [] metric_runners = [] for prefix, r in zip(["tool", "collate", "split"], [tool_runners, collate_runners, split_runners]): if not r: continue metrics = None if prefix == "collate" and "collate_metrics" in P.PARAMS["setup"]: metrics = P.PARAMS["setup"]["collate_metrics"] elif prefix == "split" and "split_metrics" in P.PARAMS["setup"]: metrics = P.PARAMS["setup"]["split_metrics"] elif "metrics" in P.PARAMS["setup"]: metrics = P.PARAMS["setup"]["metrics"] else: raise KeyError( "configuration file requires a 'setup:metrics' section") # Metric execution mm = add_metrics_to_pipeline(pipeline, metrics, map_metric_to_runner, r, suffix=suffix, prefix=prefix + "_", config=P.PARAMS, **kwargs) if len(mm) == 0: raise ValueError( "workflow construction error: " "no metric tasks result for metrics {}".format(metrics)) metric_runners.extend(mm) E.debug("added {} {}_metrics to workflow".format(len(mm), prefix)) # add plot task if "aggregate" in P.PARAMS["setup"]: aggregate_metrics = add_collations_to_pipeline( pipeline, map_collate_to_runner, P.PARAMS["setup"]["aggregate"], metric_runners, config=P.PARAMS) E.debug("added metric aggregation to workflow") else: aggregate_metrics = [] add_upload_to_pipeline(pipeline, metric_runners + aggregate_metrics, P.PARAMS) E.debug("added upload to workflow".format(prefix)) # add export task export = P.PARAMS["setup"].get("export", ["tools", "collate", "split"]) map_export2runner = { "collate": collate_runners, "tools": tool_runners, "split": split_runners } export_runners = [] for e in export: try: export_runners.extend(map_export2runner[e]) except KeyError: raise KeyError("unknown export section: {}".format(e)) add_export_to_pipeline(pipeline, export_runners, suffix=suffix, config=P.PARAMS) E.debug("added export to workflow") add_all_task_to_pipeline(pipeline, metric_runners + aggregate_metrics) # Collate output files to facilitate analysis if "collation" in P.PARAMS: collators = add_collations_to_pipeline(pipeline, map_collate_to_runner, P.PARAMS["collation"], config=P.PARAMS) E.debug("construction of workflow completed") E.debug("starting workflow") P.run_workflow(options, pipeline=pipeline)
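# Illustrative sketch (not part of the workflow): how the metric section is
# chosen per runner prefix above - a prefix-specific key such as
# "collate_metrics" takes precedence over the generic "metrics" key. The
# configuration dictionary and metric names are hypothetical.
def _example_select_metrics(prefix, setup):
    key = "{}_metrics".format(prefix)
    if prefix in ("collate", "split") and key in setup:
        return setup[key]
    if "metrics" in setup:
        return setup["metrics"]
    raise KeyError("configuration file requires a 'setup:metrics' section")

# e.g. _example_select_metrics(
#          "collate",
#          {"metrics": ["generic_metric"],
#           "collate_metrics": ["collate_only_metric"]})
# returns ["collate_only_metric"]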
def save_table(table: pandas.DataFrame, url: str, tablename: str, schema: str = None, dtypes=None, indices=["instance_id"]): logger = P.get_logger() table.columns = sql_sanitize_columns(table.columns) engine = create_engine(url) # pandas/sqlite3 prefers the raw connection, otherwise error: # AttributeError: 'Engine' object has no attribute 'rollback' if url.startswith("sqlite"): _engine = engine.raw_connection() # In pandas >= 0.23 and using sqlite as a backend, the # pandas.DataFrame.to_sql command fails with "OperationalError: # (sqlite3.OperationalError) too many SQL variables". The reason is a # fixed limit in sqlite, SQLITE_MAX_VARIABLE_NUMBER, which is by # default set to 999. sql_chunk_size = 999 // (len(table.columns) + 1) else: _engine = engine sql_chunk_size = None # lower case all table names. Otherwise issues with psql # mixed case access tablename = tablename.lower() create_index = False try: retry_table_to_sql(table, tablename, _engine, schema=schema, if_exists="fail", index=False, dtype=dtypes, chunksize=sql_chunk_size) E.debug(f"table {tablename} was new") create_index = True except TableExistsException: E.debug(f"table {tablename} already exists - appending") if create_index: # sqlite requires an index name if schema: tablename = "{}.{}".format(schema, tablename) for field in indices: E.debug(f"creating index on {field} for {tablename}") try: retry_sql_execute( _engine, str( text("CREATE INDEX {} ON {} ({})".format( re.sub("[-.]", "_", tablename) + "_" + field, tablename, field)))) except IndexExistsException: pass except TypeError as ex: logger.warn("could not create index: {}".format(str(ex))) except sqlalchemy.exc.ProgrammingError as ex: logger.warn("could not create index: {}".format(str(ex))) else: reconcile_columns(tablename, engine, table) retry_table_to_sql(table, tablename, _engine, schema=schema, if_exists="append", index=False, dtype=dtypes, chunksize=sql_chunk_size)
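# Illustrative sketch (not part of the module): why the sqlite chunk size
# above is derived from the column count - each inserted row consumes one SQL
# bound variable per column (plus one), and sqlite's default
# SQLITE_MAX_VARIABLE_NUMBER is 999. The database URL and table name are
# hypothetical.
def _example_append_dataframe_sqlite(table, url="sqlite:///./example.db"):
    from sqlalchemy import create_engine

    engine = create_engine(url)
    # pandas prefers the raw DBAPI connection for sqlite backends
    connection = engine.raw_connection()
    # keep the number of rows per INSERT below sqlite's variable limit
    sql_chunk_size = 999 // (len(table.columns) + 1)
    table.to_sql("example_table", connection, if_exists="append",
                 index=False, chunksize=sql_chunk_size)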
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = E.ArgumentParser(description=__doc__) parser.add_argument("--inplace", dest="inplace", action="store_true", help="update option list in place. New options will" "be added to the list given by --options-tsv-file. " "Options will only be added, not removed ") parser.add_argument("--options-tsv-file", dest="tsv_file", type=str, help="existing table with options. Will be updated if " "--in-place is set [default]") parser.set_defaults(inplace=False, tsv_file=None) # add common options (-h/--help, ...) and parse command line (args) = E.start(parser, argv=argv) old_options = None if args.tsv_file: if not os.path.exists(args.tsv_file): raise OSError("filename %s not found, see --options-tsv-file" % args.tsv_file) old_options = pandas.read_csv( iotools.open_file(args.tsv_file), sep="\t", index_col=0, ) old_options = old_options.fillna("") global ORIGINAL_START ORIGINAL_START = E.start all_options = collections.defaultdict(list) for label, expression in EXPRESSIONS: files = glob.glob(expression) files.sort() for f in files: E.debug("processing %s" % f) if os.path.isdir(f): continue if os.path.basename(f) in EXCLUDE: continue collected_options = collectOptionsFromScript(os.path.abspath(f)) for o in collected_options: all_options[o].append(f) # add old options for x in old_options.index: if x not in all_options: all_options[x].append("--") if args.inplace: outfile = iotools.open_file(args.tsv_file, "w") E.info("updating file '%s'" % args.tsv_file) else: outfile = args.stdout outfile.write("option\taction\tcomment\talternative\tfiles\n") for o, v in sorted(all_options.items()): try: action, comment, alternative, ff = old_options.xs(o) except KeyError: action, comment, alternative, ff = "", "", "", "" if comment == "nan": comment = "" if alternative == "nan": alternative = "" outfile.write("\t".join( (list(map(str, (o, action, comment, alternative, ",".join(v)))))) + "\n") if outfile != args.stdout: outfile.close() # write footer and output benchmark information. E.stop()
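# Illustrative sketch (not part of the script): merging newly collected
# options with a previously recorded table so that entries no longer detected
# in any script are kept with a "--" placeholder, as done above. The option
# names and file paths are made up.
def _example_merge_option_tables():
    import collections
    import pandas

    old_options = pandas.DataFrame(
        {"action": ["ok"], "comment": [""], "alternative": [""],
         "files": ["--"]},
        index=["--force-output"])

    all_options = collections.defaultdict(list)
    all_options["--output-format"].append("scripts/bam2wiggle.py")

    # carry over options that were recorded before but not found this time
    for x in old_options.index:
        if x not in all_options:
            all_options[x].append("--")
    return dict(all_options)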
def __del__(self):
    E.debug(f"closing table cache {id(self)}")
    self.close()
def __enter__(self):
    table_cache = TableCache(self.database_url, self.schema)
    E.debug(f"{os.getpid()}: created resource={id(self)}: "
            f"cache={id(table_cache)}")
    self.table_cache = table_cache
    return self
for glob_expression, template, dest in dirs:

    if not os.path.exists(dest):
        os.mkdir(dest)

    files = glob.glob(os.path.abspath(glob_expression))

    for filename in files:
        dirname, name = os.path.split(filename)
        prefix = name[:-3]

        # if os.path.exists(os.path.join(dirname, "_%s.pyx" % prefix)):
        #     E.warn("ignoring pyximport file _%s.pyx" % prefix)
        #     continue

        filename = os.path.join(os.path.abspath(dest), "%s.rst" % prefix)
        if os.path.exists(filename):
            nskipped += 1
            continue

        E.debug("adding %s" % filename)
        outfile = open(filename, "w")
        outfile.write(template % locals())
        outfile.close()
        ncreated += 1

E.info("ncreated=%i, nskipped=%i" % (ncreated, nskipped))

E.Stop()