def iterator_sorted(gff_iterator, sort_order="gene"):
    '''sort input and yield sorted output.'''
    entries = list(gff_iterator)
    if sort_order == "gene":
        entries.sort(key=lambda x: (x.gene_id, x.contig, x.start))
    elif sort_order == "gene+transcript":
        entries.sort(
            key=lambda x: (x.gene_id, x.transcript_id, x.contig, x.start))
    elif sort_order == "contig+gene":
        entries.sort(
            key=lambda x: (x.contig, x.gene_id, x.transcript_id, x.start))
    elif sort_order == "transcript":
        entries.sort(key=lambda x: (x.transcript_id, x.contig, x.start))
    elif sort_order == "position":
        entries.sort(key=lambda x: (x.contig, x.start))
    elif sort_order == "position+gene":
        entries.sort(key=lambda x: (x.gene_id, x.start))
        genes = list(flat_gene_iterator(entries))
        genes.sort(key=lambda x: (x[0].contig, x[0].start))
        entries = iotools.flatten(genes)
    elif sort_order == "gene+exon":
        entries.sort(key=lambda x: (x.gene_id, x.exon_number))

    for entry in entries:
        yield entry
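
# Minimal usage sketch (an illustration, not part of the original module):
# stream a GTF file and iterate over it sorted by contig and gene. It assumes
# the module's own iterator() parser and cgatcore's iotools.open_file; the
# filename is hypothetical.
def _example_iterate_sorted(filename="transcripts.gtf.gz"):
    with iotools.open_file(filename) as inf:
        for gtf in iterator_sorted(iterator(inf), sort_order="contig+gene"):
            print(gtf.contig, gtf.start, gtf.gene_id)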
    dbh = sqlite3.connect(P.get_params()['database'])
    return dbh


@transform(P.get_params()["input_globs"].get("default", INPUT_FORMATS),
           regex("(.*)"),
           r"\1")
def unprocessReads(infiles, outfiles):
    """dummy task - no processing of reads."""
    # if preprocess tools are specified, preprocessing is done on output
    # that has already been generated in the first run
    if P.get_params().get("preprocessors", None):
        if P.get_params()["auto_remove"]:
            # check if FastQC has been run
            for x in iotools.flatten(
                    [glob.glob(y) for y in
                     P.get_params()["input_globs"].get("default", INPUT_FORMATS)]):
                f = "fastqc.dir/" + re.match(REGEX_TRACK, x).group(1) + ".fastqc"
                if not os.path.exists(f):
                    raise ValueError(
                        "file %s missing, "
                        "you need to run the pipeline once before "
                        "specifying 'auto_remove'" % f)


@follows(mkdir("fasta.dir"))
@transform(unprocessReads,
           regex(SEQUENCEFILES_REGEX),
           r"fasta.dir/\1.fasta")
def makeAdaptorFasta(infile, outfile):
    '''Make a single fasta file for each sample of all contaminant
    adaptor sequences for removal.
    '''
def save_metric_data(meta_data, table_cache, schema, instance_id: int, session):

    logger = P.get_logger()
    metric_table_filter = None
    if "metric_no_upload" in meta_data:
        if meta_data["metric_no_upload"] == "*":
            logger.warn("upload turned off for metric {}".format(
                meta_data["metric_name"]))
            return
        else:
            metric_table_filter = re.compile(meta_data["metric_no_upload"])

    # multiple tablenames for multiple metric output
    #
    # Tables are added into schemas to avoid cluttering
    # the public namespace.
    # (if only blobs, no metric output file)
    if "metric_output_files" in meta_data:
        assert len(meta_data["metric_output_files"]) == \
            len(meta_data["metric_tablenames"])

        for output_file, tablename in zip(
                meta_data["metric_output_files"],
                meta_data["metric_tablenames"]):

            if metric_table_filter and metric_table_filter.search(tablename):
                logger.warn("upload for table {} turned off".format(
                    tablename))
                continue

            if not os.path.exists(output_file):
                logger.warning("output file {} does not exist - ignored".format(
                    output_file))
                continue

            if IOTools.is_empty(output_file):
                logger.warn("output file {} is empty - ignored".format(
                    output_file))
                continue

            # table = pandas.DataFrame({"values": [1, 2]})
            try:
                table = pandas.read_csv(output_file,
                                        sep="\t",
                                        comment="#",
                                        skip_blank_lines=True)
            except ValueError as e:
                logger.warn("table {} can not be read: {}".format(
                    output_file, str(e)))
                continue
            except pandas.parser.CParserError as e:
                logger.warn("malformatted table {} can not be read: {}".format(
                    output_file, str(e)))
                continue

            if table.empty:
                logger.warn("table {} is empty - ignored".format(output_file))
                continue

            tablename, table, dtypes = transform_table_before_upload(
                tablename, table, instance_id, meta_data, table_cache)

            if schema is None:
                tn = tablename
            else:
                tn = "{}.{}".format(schema, tablename)

            # add foreign key
            table["instance_id"] = instance_id
            logger.debug(
                f"saving data {table.shape} from {output_file} "
                f"to table {tn} under {instance_id}")
            table_cache.add_table(table, tablename, dtypes)

    if "metric_blob_globs" in meta_data:
        metric_dir = meta_data["metric_outdir"]
        files = [glob.glob(os.path.join(metric_dir, x))
                 for x in meta_data["metric_blob_globs"]]
        files = IOTools.flatten(files)
        logger.debug(
            "uploading binary data in {} files from {} to "
            "table binary_data".format(len(files), metric_dir))
        table = []
        for fn in files:
            with IOTools.open_file(fn, "rb", encoding=None) as inf:
                data_row = BenchmarkBinaryData(
                    instance_id=instance_id,
                    filename=os.path.basename(fn),
                    path=fn,
                    data=inf.read())
                session.add(data_row)
            session.commit()

    if meta_data.get("metric_tableindices", None):
        table_cache.add_indices(meta_data["metric_tableindices"])
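
# Shape of the meta_data dictionary as implied by save_metric_data above.
# The keys come from the code; the values are purely illustrative:
#
#   meta_data = {
#       "metric_name": "example_metric",
#       "metric_outdir": "example_metric.dir",
#       "metric_output_files": ["example_metric.dir/counts.tsv"],
#       "metric_tablenames": ["example_metric_counts"],
#       "metric_blob_globs": ["*.png"],    # optional: binary files to upload
#       "metric_tableindices": ...,        # optional: indices to create
#       "metric_no_upload": "*",           # optional: skip upload entirely
#   }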
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version="%prog version: $Id: diff_bed.py 2866 2010-03-03 10:18:49Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-u", "--update", dest="filename_update", type="string",
        help="if filename is given, previous results will be read from "
        "there and only changed sets will be computed [default=%default].")

    parser.add_option(
        "-p", "--pattern-identifier", dest="pattern_id", type="string",
        help="pattern to convert a filename to an id [default=%default].")

    parser.add_option(
        "-t", "--tracks", dest="tracks", action="store_true",
        help="compare files against all tracks in the first file "
        "[default=%default]")

    parser.set_defaults(
        filename_update=None,
        pattern_id="(.*).bed",
        tracks=None,
    )

    # add common options (-h/--help, ...) and parse command line
    (options, args) = E.start(parser, argv=argv)

    if len(args) < 2:
        raise ValueError("at least two arguments required")

    if options.filename_update:
        infile = iotools.open_file(options.filename_update, "r")
        previous_results = {}
        for line in infile:
            if line.startswith("#"):
                continue
            if line.startswith("set1"):
                continue
            data = line[:-1].split("\t")
            set1, set2 = data[0], data[1]

            if set1 not in previous_results:
                previous_results[set1] = {}
            if set2 not in previous_results:
                previous_results[set2] = {}

            previous_results[set1][set2] = "\t".join(data[2:])
            rev = [(data[x + 1], data[x]) for x in range(2, len(data), 2)]
            previous_results[set2][set1] = "\t".join(iotools.flatten(rev))
    else:
        previous_results = {}

    pattern_id = re.compile(options.pattern_id)

    def getTitle(x):
        try:
            return pattern_id.search(x).groups()[0]
        except AttributeError:
            return x

    ncomputed, nupdated = 0, 0

    if options.tracks:
        counter = CounterTracks(args[0])
        options.stdout.write("set1\tset2\t%s\n" % counter.getHeader())
        for filename in args[1:]:
            title1 = getTitle(filename)
            for title2 in counter.getTracks():

                if previous_results:
                    try:
                        prev = previous_results[title1][title2]
                    except KeyError:
                        pass
                    else:
                        options.stdout.write(
                            "%s\t%s\t%s\n" % (title1, title2, prev))
                        nupdated += 1
                        continue

                counter.count(filename, title2)
                options.stdout.write(
                    "%s\t%s\t%s\n" % (title1, title2, str(counter)))
                ncomputed += 1
    else:
        counter = Counter()
        options.stdout.write("set1\tset2\t%s\n" % counter.getHeader())
        for x in range(len(args)):
            title1 = getTitle(args[x])
            for y in range(0, x):
                title2 = getTitle(args[y])

                if previous_results:
                    try:
                        prev = previous_results[title1][title2]
                    except KeyError:
                        pass
                    else:
                        options.stdout.write(
                            "%s\t%s\t%s\n" % (title1, title2, prev))
                        nupdated += 1
                        continue

                counter.count(args[x], args[y])
                options.stdout.write(
                    "%s\t%s\t%s\n" % (title1, title2, str(counter)))
                ncomputed += 1

    E.info("nupdated=%i, ncomputed=%i" % (nupdated, ncomputed))

    E.stop()
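
# Hypothetical invocations (filenames illustrative, not from the original):
# compare every pair of BED files and write a set1/set2 table of counter
# fields to stdout, or use --tracks to compare each file against the tracks
# contained in the first file.
#
#   python diff_bed.py setA.bed setB.bed setC.bed > pairwise.tsv
#   python diff_bed.py --tracks all_tracks.bed setA.bed setB.bed > tracks.tsv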
def main(argv=None):

    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-s", "--ignore-strand", dest="ignore_strand",
                        action="store_true",
                        help="ignore strand information.")

    parser.add_argument(
        "-u", "--update", dest="filename_update", type=str,
        help="if filename is given, previous results will be read "
        "from there and only changed sets will be computed")

    parser.add_argument(
        "-p", "--pattern-identifier", dest="pattern_id", type=str,
        help="pattern to convert a filename to an id")

    parser.add_argument(
        "-g", "--output-only-genes", dest="output_only_genes",
        action="store_true",
        help="only output gene stats (includes gene lists)")

    parser.set_defaults(
        ignore_strand=False,
        filename_update=None,
        pattern_id="(.*).gtf",
        output_only_genes=False,
    )

    (args, unknown) = E.start(parser, unknowns=True)

    if len(unknown) < 2:
        print(USAGE)
        raise ValueError("at least two arguments required")

    if args.filename_update:
        infile = iotools.open_file(args.filename_update, "r")
        previous_results = {}
        for line in infile:
            if line.startswith("#"):
                continue
            if line.startswith("set1"):
                continue
            data = line[:-1].split("\t")
            set1, set2 = data[0], data[1]

            if set1 not in previous_results:
                previous_results[set1] = {}
            if set2 not in previous_results:
                previous_results[set2] = {}

            previous_results[set1][set2] = "\t".join(data[2:])
            rev = [(data[x + 1], data[x]) for x in range(2, len(data), 2)]
            previous_results[set2][set1] = "\t".join(iotools.flatten(rev))
    else:
        previous_results = {}

    if args.output_only_genes:
        counter = CounterGenes()
    else:
        counter = Counter()

    args.stdout.write("set1\tset2\t%s\n" % counter.getHeader())

    pattern_id = re.compile(args.pattern_id)

    def getTitle(x):
        try:
            return pattern_id.search(x).groups()[0]
        except AttributeError:
            return x

    ncomputed, nupdated = 0, 0
    for x in range(len(unknown)):
        title1 = getTitle(unknown[x])
        for y in range(0, x):
            title2 = getTitle(unknown[y])

            if previous_results:
                try:
                    prev = previous_results[title1][title2]
                except KeyError:
                    pass
                else:
                    args.stdout.write(
                        "%s\t%s\t%s\n" % (title1, title2, prev))
                    nupdated += 1
                    continue

            counter.count(unknown[x], unknown[y])
            args.stdout.write(
                "%s\t%s\t%s\n" % (title1, title2, str(counter)))
            ncomputed += 1

    E.info("nupdated=%i, ncomputed=%i" % (nupdated, ncomputed))

    E.stop()
def loadNormalisedExpression(infiles, outfiles):
    """load normalised expression tables into the database."""
    for infile in iotools.flatten(infiles):
        outfile = P.snip(infile, ".tsv.gz") + ".load"
        P.load(infile, outfile)
def loadDifferentialExpression(infiles, outfiles):
    """load differential expression tables into the database."""
    for infile in iotools.flatten(infiles):
        outfile = P.snip(infile, ".tsv") + ".load"
        P.load(infile, outfile)
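
# iotools.flatten is the helper shared by the loader tasks above: it turns
# the nested lists of result files these tasks receive into one flat
# sequence before each file is loaded. A rough sketch of the behaviour
# assumed here (one level of nesting; not the actual cgatcore
# implementation):
def _flatten_sketch(nested):
    """e.g. [["a.tsv"], ["b.tsv", "c.tsv"]] -> ["a.tsv", "b.tsv", "c.tsv"]"""
    return [item for sublist in nested for item in sublist]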