def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = E.ArgumentParser(description=__doc__) parser.add_argument("--Infile", dest="Infile", type=str, help="Supply file containing filtered 16S fasta file") parser.add_argument("--Outfile", dest="Outfile", type=str, help="Supply desired outfile name") # add common options (-h/--help, ...) and parse command line (args) = E.start(parser, argv=argv) ############################################### ############################################### ############## Execute Functions ############## ############################################### ############################################### specformatter(args.Infile, args.Outfile) # write footer and output benchmark information. E.stop()
def main(argv=None):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-f", "--fasta", dest="input_filename_fasta",
                        type=str,
                        help="filename with fasta sequences. ")

    parser.add_argument("-o", "--output-filename-sequences",
                        dest="output_filename_sequences", type=str,
                        help="output per sequence information to filename")

    parser.set_defaults(
        input_filename_fasta=None,
    )

    (args, unknown) = E.start(parser, argv=argv, unknowns=True)

    if len(unknown) > 0:
        args.input_filename_fasta = unknown[0]

    sequence_pairs = []

    if args.input_filename_fasta != "-" and os.path.exists(
            args.input_filename_fasta + ".fai"):
        has_index = 1
        fastafile = pysam.FastaFile(args.input_filename_fasta)
        sequence_pairs = list(zip(fastafile.references, fastafile.lengths))
    else:
        has_index = 0
        iterator = pysam.FastxFile(args.input_filename_fasta)
        for record in iterator:
            sequence_pairs.append((record.name, len(record.sequence)))

    lengths = numpy.array([x[1] for x in sequence_pairs])

    args.stdout.write("\t".join(("has_index", "nsequences", "total_length",
                                 "min_length", "max_length", "median_length",
                                 "mean_length")) + "\n")

    if len(lengths) > 0:
        args.stdout.write("\t".join(map(str, (
            has_index,
            len(sequence_pairs),
            lengths.sum(),
            lengths.min(),
            lengths.max(),
            numpy.median(lengths),
            lengths.mean()))) + "\n")
    else:
        args.stdout.write("\t".join(map(str, (
            has_index,
            len(sequence_pairs),
            0, "", "", "", ""))) + "\n")

    if args.output_filename_sequences:
        with iotools.open_file(args.output_filename_sequences, "w") as outf:
            outf.write("name\tlength\n")
            outf.write("\n".join(
                ["\t".join(map(str, x)) for x in sequence_pairs]) + "\n")

    E.stop()
def main(argv=None):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument(
        "--regex-filename", dest="regex_filename", type=str,
        help="extract column name from filename via regular expression ")

    parser.add_argument(
        "--filter", dest="filters", type=str, action="append",
        choices=("PASS", "SNP"),
        help="apply filters to VCFs when reading ")

    parser.set_defaults(
        regex_filename=None,
        filters=[],
    )

    (args, unknown) = E.start(parser, argv=argv, add_output_options=True,
                              unknowns=True)

    if len(unknown) < 2:
        raise ValueError("requiring at least 2 input filenames")

    dfs = []
    for filename in unknown:
        if args.regex_filename:
            try:
                name = re.search(args.regex_filename, filename).groups()[0]
            except AttributeError:
                raise ValueError(
                    "regular expression '{}' does not match {}".format(
                        args.regex_filename, filename))
        else:
            name = iotools.snip(os.path.basename(filename), ".vcf.gz")

        E.debug("reading data from {}".format(filename))
        df = read_vcf_positions_into_dataframe(filename,
                                               filters=args.filters)
        df[name] = 1
        dfs.append(df)

    ndata = len(dfs)
    merged_df = dfs[0]
    for df in dfs[1:]:
        merged_df = merged_df.merge(df, how="outer")

    merged_df = merged_df.fillna(0)
    ddf = merged_df.drop(["chrom", "pos"], axis=1)
    set_counts = ddf.groupby(by=list(ddf.columns)).size()
    set_counts = set_counts.reset_index()
    set_counts.columns = list(set_counts.columns[:-1]) + ["counts"]
    set_counts.to_csv(args.stdout, sep="\t", index=False)

    E.stop()
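
# A minimal sketch of the set-counting idiom used above, on toy data: group
# rows by their per-file membership indicator columns and count how many
# positions fall into each combination. Hypothetical helper, never called
# by the script itself.
def _demo_set_counts():
    import pandas
    df = pandas.DataFrame({"a.vcf": [1, 1, 0, 1], "b.vcf": [1, 0, 1, 1]})
    set_counts = df.groupby(by=list(df.columns)).size().reset_index()
    set_counts.columns = list(set_counts.columns[:-1]) + ["counts"]
    return set_counts  # one row per (a.vcf, b.vcf) membership combination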
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.ArgumentParser(description=__doc__) parser.add_argument("--version", action='version', version="1.0") parser.add_argument("-o", "--output-section", dest="output", type=str, choices=("full", "name"), help="output either ``full`` overlapping entries, only the ``name``s.") parser.set_defaults( output="full", ) # add common options (-h/--help, ...) and parse command line (args, unknown) = E.start(parser, argv=argv, unknowns=True) if len(unknown) != 2: raise ValueError("two arguments required") if unknown[0] == "-": infile1 = args.stdin else: infile1 = iotools.open_file(unknown[0], "r") infile2 = iotools.open_file(unknown[1], "r") idx = Bed.readAndIndex(infile2, with_values=True) output = args.output outfile = args.stdout if output == "name": outfile.write("name1\tname2\n") outf = lambda x: x.fields[0] else: outf = str for bed in Bed.iterator(infile1): try: overlaps = idx[bed.contig].find(bed.start, bed.end) except (KeyError, IndexError): # ignore missing contig and zero length intervals continue for o in overlaps: outfile.write("\t".join((outf(bed), outf(o[2]))) + "\n") E.stop()
def main(argv=sys.argv):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("--is-gtf", dest="is_gtf", action="store_true",
                        help="input is gtf.")

    parser.set_defaults(
        is_gtf=False,
    )

    (args, unknown) = E.start(parser, argv=argv, add_output_options=True,
                              unknowns=True)

    if len(unknown) == 0:
        files = [args.stdin]
    else:
        files = unknown

    args.stdout.write("track\t%s" % ("\t".join(counter_gff.fields)))

    if args.is_gtf:
        args.stdout.write("\t%s" % ("\t".join(counter_exons.fields)))
    args.stdout.write("\n")

    for f in files:
        if f == args.stdin:
            infile = f
            args.stdout.write("stdin")
        else:
            infile = iotools.open_file(f)
            args.stdout.write(f)

        counters = []
        if args.is_gtf:
            iterator = GTF.iterator(infile)
            counters.append(counter_gff(iterator))
            counters.append(counter_exons(counters[0]))
        else:
            iterator = GTF.iterator(infile)
            counters.append(counter_gff(iterator))

        # exhaust the counter chain
        c = counters[-1]
        for x in c:
            pass

        for c in counters:
            args.stdout.write("\t%s" % str(c))
        args.stdout.write("\n")

        if infile != args.stdin:
            infile.close()

    E.stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.ArgumentParser(description=__doc__) parser.add_argument("--version", action='version', version="1.0") parser.add_argument( "-a", "--first-fastq-file", dest="fastq1", type=str, help="supply read1 fastq file") parser.add_argument( "-b", "--second-fastq-file", dest="fastq2", type=str, help="supply read2 fastq file") # add common options (-h/--help, ...) and parse command line (args, unknown) = E.start(parser, argv=argv, unknowns=True) if unknown and len(unknown) == 2: args.fastq1, args.fastq2 = unknown fastq1 = iotools.open_file(args.fastq1) fastq2 = iotools.open_file(args.fastq2) E.info("iterating over fastq files") f1_count = 0 for f1, f2 in zip_longest(Fastq.iterate(fastq1), Fastq.iterate(fastq2)): if not (f1 and f2) or (not f2 and f1): try: raise PairedReadError( "unpaired reads detected. Are files sorted? are " "files of equal length?") except PairedReadError as e: raise PairedReadError(e).with_traceback(sys.exc_info()[2]) else: assert f1.identifier.endswith("/1") and \ f2.identifier.endswith("/2"), \ "Reads in file 1 must end with /1 and reads in file 2 with /2" args.stdout.write( ">%s\n%s\n>%s\n%s\n" % (f1.identifier, f1.seq, f2.identifier, f2.seq)) f1_count += 1 E.info("output: %i pairs" % f1_count) # write footer and output benchmark information. E.stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.ArgumentParser(description=__doc__) parser.add_argument("-i", "--test-option", dest="test_option", type=str, help="test option") parser.set_defaults(test_option="test") # add common options (-h/--help, ...) and parse command line (args) = E.start(parser, argv=argv) files = glob.glob(os.path.join(os.path.dirname(__file__), "*.pyx")) # do sth ninput, nskipped, noutput = 0, 0, 0 for f in files: E.info("rebuilding %s" % f) ninput += 1 prefix, suffix = os.path.splitext(f) for ext in (".c", ".pyxbldc"): try: os.remove(prefix + ext) except OSError: pass dirname, basename = os.path.split(prefix) assert basename.startswith("_") scriptname = os.path.join(dirname, basename[1:]) + ".py" if not os.path.exists(scriptname): E.warn("script %s does not exist - skipped" % scriptname) nskipped += 1 continue E.info("compiling %s" % scriptname) os.system("%s %s --help > /dev/null" % (sys.executable, scriptname)) noutput += 1 E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped)) # write footer and output benchmark information. E.stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = E.ArgumentParser(description=__doc__) parser.add_argument("-o", "--outdir", dest="outdir", type=str, help="supply output directory") parser.add_argument("-p", "--prefix", dest="prefix", type=str, help="supply output file prefix") # add common options (-h/--help, ...) and parse command line (args) = E.start(parser, argv=argv) prefix = os.path.join(args.outdir, args.prefix) d_outf = open(prefix + "_domain.tsv", "w") k_outf = open(prefix + "_kingdom.tsv", "w") p_outf = open(prefix + "_phylum.tsv", "w") c_outf = open(prefix + "_class.tsv", "w") o_outf = open(prefix + "_order.tsv", "w") f_outf = open(prefix + "_family.tsv", "w") g_outf = open(prefix + "_genus.tsv", "w") s_outf = open(prefix + "_species.tsv", "w") for line in args.stdin.readlines(): data = line[:-1].split("\t") taxon = data[0] counts = data[1:] taxonomy = taxon.split("|") if "d__" in taxonomy[-1]: print(taxon, counts) d_outf.write("\t".join([taxon] + counts) + "\n") elif "k__" in taxonomy[-1]: k_outf.write("\t".join([taxon] + counts) + "\n") elif "p__" in taxonomy[-1]: p_outf.write("\t".join([taxon] + counts) + "\n") elif "c__" in taxonomy[-1]: c_outf.write("\t".join([taxon] + counts) + "\n") elif "o__" in taxonomy[-1]: o_outf.write("\t".join([taxon] + counts) + "\n") elif "f__" in taxonomy[-1]: f_outf.write("\t".join([taxon] + counts) + "\n") elif "g__" in taxonomy[-1]: g_outf.write("\t".join([taxon] + counts) + "\n") elif "s__" in taxonomy[-1]: s_outf.write("\t".join([taxon] + counts) + "\n") # write footer and output benchmark information. E.stop()
def main(argv=None):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-d", "--delimiter", dest="delimiter", type=str,
                        help="delimiter to separate columns ")

    parser.add_argument("-m", "--method", dest="methods", type=str,
                        action="append",
                        choices=["row-describe", "column-describe"],
                        help="additional methods to apply ")

    parser.set_defaults(
        delimiter="\t",
        methods=[],
    )

    (args) = E.start(parser, argv=argv, add_output_options=True)

    if not args.methods:
        args.methods = ["summary"]

    table = pandas.read_csv(args.stdin, sep=args.delimiter)

    args.stdout.write("metric\tcount\tpercent\tinfo\n")

    for method in args.methods:
        label = re.sub("-", "_", method)
        if method == "summary":
            for category, count, denominator, info in compute_table_summary(
                    table):
                args.stdout.write("\t".join(map(str, (
                    category,
                    count,
                    iotools.pretty_percent(count, denominator, na=""),
                    info))) + "\n")

        elif method == "column-describe":
            df = table.describe().T.stack()
            with E.open_output_file(label) as outf:
                outf.write("label\tcategory\tvalue\n")
                df.to_csv(outf, sep="\t")

        elif method == "row-describe":
            df = table.T.describe().stack()
            with E.open_output_file(label) as outf:
                outf.write("label\tcategory\tvalue\n")
                df.to_csv(outf, sep="\t")

    E.stop()
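
# Sketch of what column-describe emits, on toy data (illustration only):
# pandas' describe() yields count/mean/std/min/quartiles/max per column,
# and stack() flattens the transposed result into (label, category, value)
# rows. Hypothetical helper, not called by the script.
def _demo_column_describe():
    import pandas
    table = pandas.DataFrame({"x": [1.0, 2.0, 3.0], "y": [4.0, 5.0, 6.0]})
    return table.describe().T.stack()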
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv parser = E.ArgumentParser() parser.add_argument("-p", "--arguments", type=str, dest="arguments", default="", help="Pass options and arguments to the executable. Please surround options in \"\"") parser.add_argument("-o", "--output-dir", type=str, dest="output", default=".", help="Output for the fastq files.") parser.add_argument("-f", "--fastqc", dest="fastqc", action="store_true", help="After demultiplexing open the fastq files in FastQC.") parser.add_argument("-F", "--fastqc-options", type=str, dest="fastqc_options", default="", help="Options for FastQC. Please surround options in \"\"") parser.add_argument("-H", "--bcl2fastq-help", dest="bcl2fastq_help", action="store_true", help="Print help for Illumina's bcl2fastq conversion software") (args) = E.start(parser) if subprocess.run("which bcl2fastq", shell=True).returncode: raise ValueError("bcl2fastq cannot be found") if args.bcl2fastq_help: subprocess.run("bcl2fastq --help", shell=True) return else: subprocess.run(f"bcl2fastq {args.arguments} -o {args.output}", shell=True) for infile in glob.glob(f"{args.output}/**/*.fastq.gz", recursive=True): with gzip.GzipFile(f"{infile}", "r") as f: if sum(1 for char in f.read().decode('utf-8') if char == "\n") % 4 != 0: raise ValueError(f"{infile} is either corrupt or incomplete.") if args.fastqc: for infile in glob.glob(f"{args.output}/**/*.fastq.gz", recursive=True): subprocess.run(f"fastqc {infile} {args.fastqc_options}", shell=True)
def main(argv=sys.argv):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-i", "--input-fastq", dest="input_fastq_file",
                        type=str,
                        help="input fastq file")

    parser.add_argument("-m", "--method", dest="method", type=str,
                        choices=["ont2pacbio"],
                        help="methods to apply ")

    parser.set_defaults(
        input_fastq_file=None,
        line_width=80,
        method=None,
    )

    (args, unknown) = E.start(parser, argv, add_output_options=True,
                              unknowns=True)

    if len(unknown) == 1:
        args.input_fastq_file = unknown[0]

    if args.input_fastq_file == "-":
        args.input_fastq_file = args.stdin

    outf = args.stdout
    line_width = args.line_width
    well_no = 0
    for record in pysam.FastqFile(args.input_fastq_file):
        well_no += 1
        quals = record.get_quality_array()
        seq = record.sequence
        qv = int(math.floor(sum(quals) / len(quals)))
        outf.write(">{}/{}/{}_{} RQ=0.{}\n".format(
            "test", well_no, 1, len(seq) + 1, qv))
        for x in range(0, len(seq), line_width):
            outf.write(seq[x:x + line_width] + "\n")

    E.stop()
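# Sketch of the RQ computation above (illustration only): the per-read
# quality value is the floor of the mean phred score, rendered as the
# fractional part of "RQ=0.<qv>" in the PacBio-style header. Hypothetical
# helper, not called by the script.
def _demo_mean_quality(quals=(30, 31, 33)):
    import math
    return int(math.floor(sum(quals) / len(quals)))  # -> 31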
def main(argv=sys.argv):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-i", "--input-fastq-file", dest="input_fastq_file",
                        type=str,
                        help="input fastq file. ")

    parser.add_argument("-m", "--method", dest="methods", action="append",
                        type=str,
                        choices=("length", ),
                        help="methods to apply ")

    parser.set_defaults(
        methods=[],
        input_fastq_file=None,
    )

    (args, unknown) = E.start(parser, argv, unknowns=True)

    if len(unknown) == 1:
        args.input_fastq_file = unknown[0]

    if args.input_fastq_file is None:
        raise ValueError("missing input fastq file")

    counter = E.Counter()

    # note: complete rewrite with Counters, currently only length
    if args.methods != ["length"]:
        raise NotImplementedError()

    with pysam.FastqFile(args.input_fastq_file) as inf:
        for read in inf:
            counter.input += 1
            args.stdout.write(
                "\t".join(map(str, (read.name, len(read.sequence)))) + "\n")
            counter.output += 1

    E.info(counter)
    E.stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.ArgumentParser(description=__doc__) parser.add_argument("--version", action='version', version="1.0") parser.set_defaults() # add common options (-h/--help, ...) and parse command line (args, unknown) = E.start(parser, argv=argv, unknowns=True) if len(unknown) == 0 or (len(unknown) == 1 and unknown[0] == "-"): infile = args.stdin else: infile = fileinput.FileInput(args) # do sth ninput, nskipped, noutput = 0, 0, 0 header = False for line in infile: ninput += 1 if line.startswith("#"): pass elif not header: header = line elif line == header: nskipped += 1 continue args.stdout.write(line) noutput += 1 E.info("ninput=%i, noutput=%i, nskipped=%i" % (ninput, noutput, nskipped)) # write footer and output benchmark information. E.stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = E.ArgumentParser(description=__doc__) # add common options (-h/--help, ...) and parse command line (args) = E.start(parser, argv=argv) infile = argv[-1] for record in makeSplicedFasta(infile): options.stdout.write(record) # write footer and output benchmark information. E.stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = E.ArgumentParser(description=__doc__) parser.add_argument("-k", "--keep-header", dest="keep_header", type=int, help="randomize, but keep header in place ") parser.set_defaults(keep_header=0) # add common options (-h/--help, ...) and parse command line (args) = E.start(parser, argv=argv) inf = args.stdin outf = args.stdout c = E.Counter() for x in range(args.keep_header): c.header += 1 outf.write(inf.readline()) lines = inf.readlines() c.lines_input = len(lines) random.shuffle(lines) for line in lines: outf.write(line) c.lines_output = len(lines) E.info(c) # write footer and output benchmark information. E.stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.ArgumentParser(description=__doc__) parser.add_argument("--version", action='version', version="1.0") parser.add_argument( "-s", "--method=sort --sort-order", dest="sort", type=str, help="fields to take (in sorted order).") (args) = E.start(parser, add_csv_options=True) reader = csv.DictReader(E.stdin, dialect=args.csv_dialect) if args.sort: fields = args.sort.split(",") else: fields = None writer = csv.DictWriter(E.stdout, fields, dialect=args.csv_dialect, lineterminator=args.csv_lineterminator, extrasaction='ignore') E.stdout.write("\t".join(fields) + "\n") for row in reader: row = iotools.convertDictionary(row) writer.writerow(row) E.stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = E.ArgumentParser(description=__doc__) parser.add_argument("-t", "--test", dest="test", type=str, help="supply help") # add common options (-h/--help, ...) and parse command line (args) = E.start(parser, argv=argv) # write footer and output benchmark information. E.stop()
def main():
    parser = E.ArgumentParser()
    (args, unknown) = E.start(parser, unknowns=True)

    bamfile = pysam.AlignmentFile(unknown[0])
    outbam = pysam.AlignmentFile(unknown[1], "wb", template=bamfile)

    chunks = 0
    for read1s, read2s in chunk_bam_by_readname(bamfile):
        chunks += 1
        if chunks % 1000000 == 0:
            E.info("Done %s fragments" % chunks)
        for contig in read1s:
            for read in read1s[contig]:
                read2 = find_read2(read2s, read)
                if read2 is not None:
                    outbam.write(read)
                    outbam.write(read2)

    outbam.close()
    E.stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = E.ArgumentParser(description=__doc__) parser.add_argument("--task", dest="task", type=str, choices=["extract_table", "get_coverage", "clean_table"], help="task to perform") parser.add_argument("-t", "--table-name", dest="table", type=str, help="table in SQLite DB to extract") # add common options (-h/--help, ...) and parse command line (args) = E.start(parser, argv=argv, add_database_options=True) if args.task == "extract_table": out_df = getTableFromDb(args.database_url, args.table) elif args.task == "get_coverage": out_df = getModelCoverage(args.database_url, table_regex="(\S+)_transcript_counts") elif args.task == "clean_table": infile = argv[-1] out_df = cleanStatsTable(infile) out_df.to_csv(args.stdout, sep="\t", index_label="track") # write footer and output benchmark information. E.stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = E.ArgumentParser(description=__doc__) parser.add_argument("--Mapping-file", dest="mappingfile", type=str, help= "Supply mapping file in tsv format filename \t taxonomy") parser.add_argument("--Outfile", dest="outfile", type=str, help= "Desired outfile name") # add common options (-h/--help, ...) and parse command line (args) = E.start(parser, argv=argv) ############################################### ############################################### ############## Execute Functions ############## ############################################### ############################################### file2tax = FileMap(args.mappingfile) outfile = OutFile(args.outfile) for fastafile, taxonomy in file2tax.items(): RenameFastaTitle(fastafile, file2tax, outfile) # write footer and output benchmark information. E.stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.ArgumentParser(description=__doc__) parser.add_argument("-r", "--remove", dest="remove", action="store_true", help="remove specified columns, keep all others.") parser.add_argument("-u", "--unique", dest="unique", action="store_true", help="output rows are uniq.") parser.add_argument( "-l", "--large", dest="large", action="store_true", help="large columns. Do not use native python csv module.") parser.add_argument("-f", "--filename-fields", dest="filename_fields", type=str, help="filename with field information.") parser.set_defaults( remove=False, unique=False, large=False, filename_fields=None, ) (args, unknown) = E.start(parser, add_csv_options=True, quiet=True, unknowns=True) input_fields = unknown if args.filename_fields: input_fields = [ x[:-1].split("\t")[0] for x in [ x for x in iotools.open_file(args.filename_fields, "r").readlines() if x[0] != "#" ] ] if args.unique: outfile = UniqueBuffer(args.stdout) else: outfile = args.stdout while 1: line = args.stdin.readline() if not line: E.stop() sys.exit(0) if line[0] == "#": continue first_line = line break old_fields = first_line[:-1].split("\t") fields = [] for f in input_fields: # do pattern search if f[0] == "%" and f[-1] == "%": pattern = re.compile(f[1:-1]) for o in old_fields: if pattern.search(o) and o not in fields: fields.append(o) else: if f in old_fields: fields.append(f) if args.remove: fields = set(fields) fields = [x for x in old_fields if x not in fields] if args.large: reader = DictReaderLarge(CommentStripper(args.stdin), fieldnames=old_fields, dialect=args.csv_dialect) else: reader = csv.DictReader(CommentStripper(args.stdin), fieldnames=old_fields, dialect=args.csv_dialect) writer = csv.DictWriter(outfile, fields, dialect=args.csv_dialect, lineterminator=args.csv_lineterminator, extrasaction='ignore') print("\t".join(fields)) first_row = True ninput, noutput, nerrors = 0, 0, 0 while 1: ninput += 1 try: row = six.next(reader) except _csv.Error as msg: args.stderr.write("# error while parsing: %s\n" % (msg)) nerrors += 1 continue except StopIteration: break if not row: break writer.writerow(row) noutput += 1 E.info("ninput=%i, noutput=%i, nerrors=%i" % (ninput, noutput, nerrors)) E.stop()
def main(argv=None):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument(
        "--input-filename-fasta", dest="input_filename_fasta", type=str,
        help="filename with reference sequence in fasta format ")

    parser.add_argument(
        "--input-filename-bam", dest="input_filename_bam", type=str,
        help="filename with aligned reads ")

    parser.add_argument(
        "--method", dest="methods", type=str, action="append",
        choices=["add-strelka-genotype", "lift-over"],
        help="methods to apply ")

    parser.add_argument(
        "--input-filename-chain", dest="input_filename_chain", type=str,
        help="filename with alignment chain for lift-over ")

    parser.add_argument(
        "--normal-sample-regex", dest="normal_sample_regex", type=str,
        help="regular expression to apply to header to identify normal "
        "sample id ")

    parser.add_argument(
        "--output-filename-unmapped", dest="output_filename_unmapped",
        type=str,
        help="filename with variants that could not be lifted over ")

    parser.set_defaults(
        input_filename_fasta=None,
        input_filename_bam=None,
        input_filename_vcf="-",
        sample_size=0.001,
        region_size=20,
        methods=[],
        normal_sample_regex=None,
        input_filename_chain=None,
        output_filename_unmapped=None,
    )

    (args, unknown) = E.start(parser, argv=argv, add_output_options=True,
                              unknowns=True)

    if len(unknown) > 0:
        args.input_filename_vcf = unknown[0]

    vcf_in = pysam.VariantFile(args.input_filename_vcf)

    if "lift-over" in args.methods:
        if args.input_filename_chain is None:
            raise ValueError(
                "--method=lift-over requires --input-filename-chain")
        if not os.path.exists(args.input_filename_chain):
            raise OSError("file {} with chain data does not exist".format(
                args.input_filename_chain))
        E.info("reading chain from {}".format(args.input_filename_chain))
        with iotools.open_file(args.input_filename_chain) as inf:
            map_chain, map_contig2length = read_liftover_chain(inf)

    if args.input_filename_fasta:
        fasta = pysam.FastaFile(args.input_filename_fasta)
    else:
        fasta = None

    if args.input_filename_bam:
        bam = pysam.AlignmentFile(args.input_filename_bam)
    else:
        bam = None

    outf = args.stdout

    c = E.Counter()

    if "add-strelka-genotype" in args.methods:

        map_nt2gt = {"ref": "0/0", "het": "0/1", "hom": "1/1",
                     "conflict": "."}
        map_tumour2gt = {"ref": "0/0", "het": "0/1", "hom": "1/1"}

        header = str(vcf_in.header).splitlines()
        header.insert(
            len(header) - 1,
            '##FORMAT=<ID=GT,Number=1,Type=String,Description='
            '"Genotypes of reference and alternative alleles, '
            'added by cgatcore vcf2vcf.">')
        header = "\n".join(header)

        if args.normal_sample_regex:
            normal_sample = re.search(
                r" -bam-file \S+/([^/]+)_S\d+.bam", header).groups()[0]
        else:
            normal_sample = "NORMAL"

        is_first = True

        for record in vcf_in:
            c.input += 1

            if "GT" in record.format:
                if is_first:
                    outf.write(header + "\n")
                    is_first = False
                outf.write(str(record))
                c.has_gt += 1
                continue

            gt_normal = map_nt2gt[record.info["NT"]]
            gt_tumour = record.info["SGT"]
            norm, tumour = gt_tumour.split("->")
            if gt_tumour[0] in "ACGT":
                alts = record.alts
                if alts is None:
                    c.no_alt += 1
                    continue
                if len(record.alts) > 1:
                    c.multi_allelic += 1
                    continue

                _map_tumour2gt = {record.alts[0]: "1",
                                  record.ref: "0"}
                try:
                    gt_tumour = "/".join(
                        sorted([_map_tumour2gt[x] for x in tumour]))
                except KeyError:
                    gt_tumour = "."
                    c.ambiguous_genotype += 1
            else:
                gt_tumour = map_tumour2gt[tumour]

            fields = str(record)[:-1].split("\t")
            # FORMAT
            fields[8] = ":".join(("GT", fields[8]))
            # SAMPLES
            # makes a few assumptions, fix!
            header_insert_normal = False
            if len(fields) == 11:
                fields[9] = ":".join((gt_normal, fields[9]))
                fields[10] = ":".join((gt_tumour, fields[10]))
            elif len(fields) == 10:
                header_insert_normal = True
                values = fields[9].split(":")
                fields.append(":".join((gt_tumour, fields[9])))
                fields[9] = ":".join([gt_normal] + ["."] * len(values))
            else:
                raise NotImplementedError()

            if is_first:
                if not header_insert_normal:
                    outf.write(header + "\n")
                else:
                    header = re.sub(
                        r"\tFORMAT\t",
                        "\tFORMAT\t%s\t" % normal_sample, header)
                    outf.write(header + "\n")
                is_first = False
            outf.write("\t".join(fields) + "\n")
            c.output += 1

    elif "lift-over" in args.methods:

        header = str(vcf_in.header).splitlines()

        if fasta:
            # validate contig size
            expected_lengths = dict(list(zip(fasta.references,
                                             fasta.lengths)))
        else:
            expected_lengths = map_contig2length

        # update contig names and sizes in VCF header
        header = [x for x in header if not x.startswith("##contig")]
        header[-1:-1] = ["##contig=<ID={},length={}>".format(
            contig, length)
            for contig, length in sorted(expected_lengths.items())]

        header.insert(
            len(header) - 1,
            '##liftover=<CHAIN={},REFERENCE={}>'.format(
                args.input_filename_chain,
                args.input_filename_fasta))
        outf.write("\n".join(header) + "\n")

        unmapped_contigs = set()
        unknown_contigs = set()

        trans_genotypes = str.maketrans("01", "10")

        if fasta:
            # validate contig sizes
            expected_lengths = dict(list(zip(fasta.references,
                                             fasta.lengths)))
            for contig, length in list(map_contig2length.items()):
                if contig in expected_lengths:
                    if length != expected_lengths[contig]:
                        raise ValueError(
                            "contig lengths mismatch. For contig {} chain "
                            "file says {}, but fasta file says {}".format(
                                contig, length, expected_lengths[contig]))
            E.info("contig sizes in chain file and fasta files correspond.")

        if args.output_filename_unmapped:
            outfile_unmapped = iotools.open_file(
                args.output_filename_unmapped, "w")
            outfile_unmapped.write("\n".join(header) + "\n")
        else:
            outfile_unmapped = None

        for record in vcf_in:
            c.input += 1

            try:
                mm = map_chain[record.contig]
            except KeyError:
                c.skipped_unmapped_contig += 1
                unmapped_contigs.add(record.contig)
                if outfile_unmapped:
                    outfile_unmapped.write(
                        "skipped_unmapped_contig\t{}".format(str(record)))
                continue

            try:
                m = mm.search(record.start, record.stop)
            except AttributeError:
                c.skipped_mapping_error += 1
                if outfile_unmapped:
                    outfile_unmapped.write(
                        "skipped_mapping_error\t{}".format(str(record)))
                continue

            if len(m) == 0:
                c.skipped_unmapped_position += 1
                if outfile_unmapped:
                    outfile_unmapped.write(
                        "skipped_unmapped_position\t{}".format(str(record)))
                continue
            elif len(m) > 1:
                c.skipped_multimapping_position += 1
                if outfile_unmapped:
                    outfile_unmapped.write(
                        "skipped_multimapping_position\t{}".format(
                            str(record)))
                continue

            m = m[0]
            y_contig, y_start, y_end, y_invert = m.data

            if y_invert:
                y_pos = y_end - (record.start - m.start)
            else:
                y_pos = (record.start - m.start) + y_start

            ref_base = None
            if fasta:
                try:
                    ref_base = fasta.fetch(
                        y_contig, y_pos, y_pos + len(record.ref)).upper()
                except KeyError:
                    c.skipped_unknown_contig += 1
                    unknown_contigs.add(y_contig)
                    continue

            swap_alleles = False
            if ref_base:
                error = False
                if ref_base == record.ref:
                    c.matches += 1
                else:
                    if len(record.alts) == 1:
                        alt_base = record.alts[0]
                        if ref_base == alt_base:
                            swap_alleles = True
                            c.allele_swap_variant += 1
                        else:
                            c.error_mismatch_variant += 1
                            error = "mismatch"
                    else:
                        error = "multi-mismatch"
                        c.error_multi_mismatch_variant += 1

                if error:
                    if outfile_unmapped:
                        outfile_unmapped.write(
                            "{}\t{}".format(error, str(record)))
                    c.skipped_error_variant += 1
                    continue

            fields = str(record)[:-1].split("\t")
            fields[0] = y_contig
            fields[1] = str(y_pos)

            if swap_alleles:
                # swap REF and ALT (fields[3] and fields[4] in a VCF line)
                fields[3] = alt_base
                fields[4] = record.ref
                # update genotype fields
                keep = False
                for idx in range(9, len(fields)):
                    gt, rest = fields[idx].split(":", 1)
                    keep = keep or "0" in gt
                    fields[idx] = ":".join(
                        (gt.translate(trans_genotypes), rest))
                # remove reference only calls
                if not keep:
                    if outfile_unmapped:
                        outfile_unmapped.write(
                            "reference_call\t{}".format(str(record)))
                    c.skipped_allele_swap_reference += 1
                    continue

            c.output += 1
            outf.write("\t".join(fields) + "\n")

        c.unmapped_contigs = len(unmapped_contigs)
        c.unknown_contigs = len(unknown_contigs)

        E.info(c.asTable())
        if unknown_contigs:
            E.info("unknown contigs: {}".format(
                ",".join(sorted(unknown_contigs))))
        if unmapped_contigs:
            E.info("unmapped contigs: {}".format(
                ",".join(sorted(unmapped_contigs))))

    E.stop()
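
# Sketch of the allele-swap genotype fix-up above (illustration only):
# when REF and ALT are exchanged during lift-over, every 0 in the GT field
# must become 1 and vice versa; str.maketrans does this in a single pass.
# Hypothetical helper, not called by the script.
def _demo_swap_genotype(gt="0/1"):
    trans_genotypes = str.maketrans("01", "10")
    return gt.translate(trans_genotypes)  # "0/1" -> "1/0"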
def main(argv=sys.argv):

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-s", "--session", dest="session", type=str,
                        help="load session before creating plots ")

    parser.add_argument("-d", "--snapshot-dir", dest="snapshotdir",
                        type=str,
                        help="directory to save snapshots in ")

    parser.add_argument("-f", "--format", dest="format", type=str,
                        choices=("png", "eps", "svg"),
                        help="output file format ")

    parser.add_argument("-o", "--host", dest="host", type=str,
                        help="host that IGV is running on ")

    parser.add_argument("-p", "--port", dest="port", type=int,
                        help="port that IGV listens at ")

    parser.add_argument("-e", "--extend", dest="extend", type=int,
                        help="extend each interval by a number of bases ")

    parser.add_argument("-x", "--expand", dest="expand", type=float,
                        help="expand each region by a certain factor ")

    parser.add_argument("--session-only", dest="session_only",
                        action="store_true",
                        help="plot session after opening, "
                        "ignore intervals ")

    parser.add_argument("--new-instance", dest="new_instance",
                        action="store_true",
                        help="start a new IGV instance ")

    parser.add_argument("-n", "--name", dest="name", type=str,
                        choices=("bed-name", "increment"),
                        help="name to use for snapshot ")

    parser.set_defaults(
        command="igv.sh",
        host='127.0.0.1',
        port=61111,
        snapshotdir=os.getcwd(),
        extend=0,
        format="png",
        expand=1.0,
        session=None,
        session_only=False,
        keep_open=False,
        name="bed-name",
        new_instance=False,
    )

    # add common options (-h/--help, ...) and parse command line
    (args) = E.start(parser, argv=argv, add_output_options=True)

    igv_process = None
    if args.new_instance:
        E.info("starting new IGV process")
        igv_process = IGV.startIGV(command=args.command, port=args.port)
        E.info("new IGV process started")

    E.info("connection to process on %s:%s" % (args.host, args.port))
    E.info("saving images in %s" % args.snapshotdir)
    igv = IGV(host=args.host,
              port=args.port,
              snapshot_dir=os.path.abspath(args.snapshotdir))

    if args.session:
        E.info('loading session from %s' % args.session)
        igv.load(args.session)
        E.info('loaded session')

    if args.session_only:
        E.info('plotting session only ignoring any intervals')
        fn = "%s.%s" % (os.path.basename(args.session), args.format)
        E.info("writing snapshot to '%s'" %
               os.path.join(args.snapshotdir, fn))
        igv.save(fn)

    else:
        c = E.Counter()
        for bed in pysam.tabix_iterator(args.stdin,
                                        parser=pysam.asBed()):
            c.input += 1

            # IGV can not deal with white-space in filenames
            if args.name == "bed-name":
                name = re.sub(r"\s", "_", bed.name)
            elif args.name == "increment":
                name = str(c.input)

            E.info("going to %s:%i-%i for %s" %
                   (bed.contig, bed.start, bed.end, name))

            start, end = bed.start, bed.end
            extend = args.extend
            if args.expand:
                d = end - start
                extend = max(extend, (args.expand * d - d) // 2)

            start -= extend
            end += extend

            igv.go("%s:%i-%i" % (bed.contig, start, end))

            fn = E.get_output_file("%s.%s" % (name, args.format))
            E.info("writing snapshot to '%s'" % fn)
            igv.save(fn)

            c.snapshots += 1

        E.info(c)

    if igv_process is not None and not args.keep_open:
        E.info('shutting down IGV')
        igv_process.send_signal(signal.SIGKILL)

    E.stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if not argv: argv = sys.argv # setup command line parser parser = E.ArgumentParser(description=__doc__) parser.add_argument("--version", action='version', version="1.0") parser.add_argument("-t", "--template-bam-file", dest="filename_genome_bam", type=str, help="input bam file for header information ") parser.add_argument("-s", "--contigs-tsv-file", dest="filename_contigs", type=str, help="filename with contig sizes ") parser.add_argument( "-o", "--colour", dest="colour_mismatches", action="store_true", help="mismatches will use colour differences (CM tag) ") parser.add_argument("-i", "--ignore-mismatches", dest="ignore_mismatches", action="store_true", help="ignore mismatches ") parser.add_argument("-c", "--remove-contigs", dest="remove_contigs", type=str, help="','-separated list of contigs to remove ") parser.add_argument("-f", "--force-output", dest="force", action="store_true", help="force overwriting of existing files ") parser.add_argument("-u", "--unique", dest="unique", action="store_true", help="remove reads not matching uniquely ") parser.set_defaults( filename_genome_bam=None, filename_gtf=None, filename_mismapped=None, remove_contigs=None, force=False, unique=False, colour_mismatches=False, ignore_mismatches=False, ) # add common options (-h/--help, ...) and parse command line (args) = E.start(parser, argv=argv) genomefile, referencenames, referencelengths = None, None, None if args.filename_genome_bam: genomefile = pysam.AlignmentFile(args.filename_genome_bam, "rb") elif args.filename_contigs: contigs = iotools.ReadMap(iotools.open_file(args.filename_contigs)) data = list(zip(*list(contigs.items()))) referencenames, referencelengths = data[0], list(map(int, data[1])) else: raise ValueError( "please provide either --template-bam-file or --contigs-tsv-file") infile = pysam.AlignmentFile("-", "rb") outfile = pysam.AlignmentFile("-", "wb", template=genomefile, referencenames=referencenames, referencelengths=referencelengths) if args.colour_mismatches: tag = "CM" else: tag = "NM" nambiguous = 0 ninput = 0 nunmapped = 0 ncigar = 0 nfull = 0 noutput = 0 contig2tid = dict([(y, x) for x, y in enumerate(outfile.references)]) for qname, readgroup in itertools.groupby(infile, lambda x: x.qname): ninput += 1 reads = list(readgroup) if reads[0].is_unmapped: nunmapped += 1 continue # filter for best match best = min([x.opt(tag) for x in reads]) reads = [x for x in reads if x.opt(tag) == best] if len(reads) > 1: nambiguous += 1 continue read = reads[0] # reject complicated matches (indels, etc) # to simplify calculations below. 
if len(read.cigar) > 1: ncigar += 1 continue # set NH flag to latest count t = dict(read.tags) t['NH'] = 1 read.tags = list(t.items()) sname = infile.getrname(read.tid) contig, first_exon_start, middle, last_exon_end, splice, strand = sname.split( "|") first_exon_end, last_exon_start = middle.split("-") first_exon_start, first_exon_end, last_exon_start, last_exon_end = list( map(int, (first_exon_start, first_exon_end, last_exon_start, last_exon_end))) first_exon_end += 1 total = first_exon_end - first_exon_start + \ last_exon_end - last_exon_start first_exon_length = first_exon_end - first_exon_start match1 = first_exon_length - read.pos intron_length = last_exon_start - first_exon_end match2 = read.qlen - match1 # match lies fully in one exon - ignore if match1 <= 0 or match2 <= 0: nfull += 1 continue # increment pos read.pos = first_exon_start + read.pos read.tid = contig2tid[contig] # 3 = BAM_CREF_SKIP read.cigar = [(0, match1), (3, intron_length), (0, match2)] outfile.write(read) noutput += 1 outfile.close() if genomefile: genomefile.close() c = E.Counter() c.input = ninput c.output = noutput c.full = nfull c.cigar = ncigar c.ambiguous = nambiguous c.unmapped = nunmapped E.info("%s" % str(c)) # write footer and output benchmark information. E.stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv parser = E.ArgumentParser(description=__doc__) parser.add_argument("--version", action='version', version="1.0") parser.add_argument("-g", "--genome-file", dest="genome_file", type=str, help="filename with genome") parser.add_argument("-f", "--features", dest="features", type=str, action="append", help="features to collect ") parser.add_argument("-w", "--window-size", dest="window_size", type=int, help="window size in bp for histogram computation. " "Determines the bin size. ") parser.add_argument("-b", "--num-bins", dest="num_bins", type=int, help="number of bins for histogram computation " "if window size is not given. ") parser.add_argument("-m", "--method", dest="method", type=str, choices=("genomic", "histogram", ), help="methods to apply. ") parser.set_defaults( genome_file=None, window_size=None, num_bins=1000, value_format="%6.4f", features=[], method="genomic", ) (args) = E.start(parser, add_output_options=True) if args.genome_file: fasta = IndexedFasta.IndexedFasta(args.genome_file) else: fasta = None if args.method == "histogram": gff = GTF.readFromFile(args.stdin) gff.sort(key=lambda x: (x.contig, x.start)) chunk = [] last_contig = None for entry in gff: if last_contig != entry.contig: processChunk(last_contig, chunk, args, fasta) last_contig = entry.contig chunk = [] chunk.append(entry) processChunk(last_contig, chunk, args, fasta) elif args.method == "genomic": intervals = collections.defaultdict(int) bases = collections.defaultdict(int) total = 0 for entry in GTF.iterator(args.stdin): intervals[(entry.contig, entry.source, entry.feature)] += 1 bases[(entry.contig, entry.source, entry.feature) ] += entry.end - entry.start total += entry.end - entry.start args.stdout.write("contig\tsource\tfeature\tintervals\tbases") if fasta: args.stdout.write( "\tpercent_coverage\ttotal_percent_coverage\n") else: args.stdout.write("\n") total_genome_size = sum( fasta.getContigSizes(with_synonyms=False).values()) for key in sorted(intervals.keys()): nbases = bases[key] nintervals = intervals[key] contig, source, feature = key args.stdout.write("\t".join(("\t".join(key), str(nintervals), str(nbases)))) if fasta: args.stdout.write( "\t%f" % (100.0 * float(nbases) / fasta.getLength(contig))) args.stdout.write( "\t%f\n" % (100.0 * float(nbases) / total_genome_size)) else: args.stdout.write("\n") E.stop()
def main(argv=None): """script main. parses command line options in sys.argv, unless *argv* is given. """ if argv is None: argv = sys.argv # setup command line parser parser = E.ArgumentParser(description=__doc__) parser.add_argument("-m", "--method", dest="method", type=str, action="store", choices=("hierarchy", "set-field", "set-pattern", "set-none"), help="Method to use for conversion") parser.add_argument("-g", "--gene-type", dest="gene_type", type=str, help="feature type to get gene_id from if possible ") parser.add_argument( "-t", "--transcript-type", dest="transcript_type", type=str, help="feature type to get transcript_id from if possible ") parser.add_argument( "-d", "--no-discard", dest="discard", action="store_false", help= "Do not discard feature types specified by GENE_TYPE and TRANSCRIPT_TYPE" ) parser.add_argument("--gene-id", dest="gene_field_or_pattern", type=str, help="Either field or pattern for the gene_id ") parser.add_argument("--transcript-id", dest="transcript_field_or_pattern", type=str, help="Either field or pattern for the transcript_id ") parser.add_argument( "--parent-field", dest="parent", type=str, help="field that specifies the parent relationship. Currently only" "if left as Parent will features with multiple parents be parsed" "correctly" "") parser.add_argument( "--read-twice", dest="read_twice", action="store_true", help= "Instead of holding the whole file in memory, read once for parsing the " "hierarchy, and then again for actaully doing the conversion. Means a real file " "and not a pipe must be provided." "") parser.add_argument( "--by-chrom", dest="by_chrom", action="store_true", help="Parse input file one choromosome at a time. Reduces memory usage, " "but input must be sorted by chromosome and features may not split accross " " multiple chromosomes" "") parser.add_argument( "--fail-missing-gene", dest="missing_gene", action="store_false", help="Fail if no feature of type GENE_TYPE is found instead of using " "defaulting to highest object in hierarchy" "") parser.set_defaults(method="hierarchy", gene_type="gene", transcript_type="mRNA", discard=True, gene_field_or_pattern="ID", transcript_field_or_pattern="ID", read_twice=False, by_chrom=False, missing_gene=True, parent="Parent") # add common options (-h/--help, ...) and parse command line (args) = E.start(parser, argv=argv) gffs = GFF3.flat_file_iterator(args.stdin) if args.by_chrom: gffs = GFF3.chrom_iterator(gffs) else: gffs = [gffs] # running early so that fails early if configuration is wrong if args.read_twice: # Will throw IOError if args.stdin is not a normal file second_gff = GFF3.flat_file_iterator(iotools.open_file( args.stdin.name)) if args.by_chrom: second_gff = GFF3.chrom_iterator(second_gff) else: second_gff = iter([second_gff]) else: second_gff = None for chunk in gffs: if args.read_twice: second_gff_chunk = next(second_gff) else: chunk = list(chunk) second_gff_chunk = chunk if args.method == "hierarchy": convert_hierarchy(chunk, second_gff_chunk, args) elif args.method == "set-field": gene_id_pattern = "%%(%s)s" % args.gene_field_or_pattern transcript_id_pattern = "%%(%s)s" % args.transcript_field_or_pattern convert_set(chunk, gene_id_pattern, transcript_id_pattern, args) elif args.method == "set-pattern": convert_set(chunk, args.gene_field_or_pattern, args.transcript_field_or_pattern, args) elif args.method == "set-none": convert_set(chunk, None, None, args) # write footer and output benchmark information. E.stop()
def main(argv=None):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-g", "--genome-file", dest="genome_file",
                        type=str,
                        help="filename with genome.")

    parser.add_argument("-q", "--quality-file", dest="quality_file",
                        type=str,
                        help="filename with genomic base quality "
                        "information.")

    parser.add_argument("-b", "--bam-file", dest="bam_files", type=str,
                        metavar="bam",
                        help="filename with read mapping information. "
                        "Multiple files can be submitted in a "
                        "comma-separated list.")

    parser.add_argument("-i", "--bigwig-file", dest="bigwig_file",
                        type=str, metavar="bigwig",
                        help="filename with bigwig information ")

    parser.add_argument("-f", "--gff-file", dest="filename_gff", type=str,
                        action="append", metavar='bed',
                        help="filename with extra gff files. The order "
                        "is important.")

    parser.add_argument("--filename-format", dest="filename_format",
                        type=str,
                        choices=("bed", "gff", "gtf"),
                        help="format of secondary stream.")

    parser.add_argument("--restrict-source", dest="gff_sources", type=str,
                        action="append",
                        help="restrict input to this 'source' in extra "
                        "gff file (for counter: overlap).")

    parser.add_argument("--restrict-feature", dest="gff_features",
                        type=str, action="append",
                        help="restrict input to this 'feature' in extra "
                        "gff file (for counter: overlap).")

    parser.add_argument("-r", "--reporter", dest="reporter", type=str,
                        choices=("genes", "transcripts"),
                        help="report results for 'genes' or 'transcripts' ")

    parser.add_argument("-s", "--section", dest="sections", type=str,
                        action="append",
                        choices=("exons", "introns"),
                        help="select range on which counters will operate ")

    parser.add_argument(
        "-c", "--counter", dest="counters", type=str, action="append",
        choices=("bigwig-counts", "binding-pattern", "classifier",
                 "classifier-rnaseq", "classifier-rnaseq-splicing",
                 "classifier-polii", "composition-na", "composition-cpg",
                 "coverage", "distance", "distance-genes", "distance-tss",
                 "length", 'neighbours', "overlap", "overlap-stranded",
                 "overlap-transcripts", "overrun", "position", "proximity",
                 "proximity-exclusive", "proximity-lengthmatched",
                 "quality", "read-coverage", "read-extension",
                 "read-overlap", "read-counts", "read-fullcounts",
                 "readpair-counts", "readpair-fullcounts", "splice",
                 "splice-comparison", "territories"),
        help="select counters to apply to input ")

    parser.add_argument("--add-gtf-source", dest="add_gtf_source",
                        action="store_true",
                        help="add gtf field of source to output ")

    parser.add_argument("--proximal-distance", dest="proximal_distance",
                        type=int,
                        help="distance to be considered proximal to "
                        "an interval.")

    parser.add_argument("--multi-mapping-method", dest="multi_mapping",
                        type=str,
                        choices=('all', 'ignore', 'weight'),
                        help="how to treat multi-mapping reads in "
                        "bam-files. Requires "
                        "the NH flag to be set by the mapper ")

    parser.add_argument("--use-barcodes", dest="use_barcodes",
                        action="store_true",
                        help="Use barcodes to count unique umi's. "
                        "UMI's are specified in the read identifier "
                        "as the last field, where fields are separated "
                        "by underscores, e.g. "
                        "@READ:ILLUMINA:STUFF_NAMINGSTUFF_UMI. "
                        "When true, unique counts are returned. "
                        "Currently only compatible with count-reads")

    parser.add_argument("--sample-probability",
                        dest="sample_probability", type=float,
                        help="Specify the probability of whether any "
                        "given read or read pair in a bam file is "
                        "counted. "
                        "Currently only compatible with count-reads")

    parser.add_argument("--column-prefix", dest="prefixes", type=str,
                        action="append",
                        help="add prefix to column headers - prefixes "
                        "are used in the same order as the counters ")

    parser.add_argument("--library-type", dest="library_type", type=str,
                        choices=("unstranded", "firststrand",
                                 "secondstrand", "fr-unstranded",
                                 "fr-firststrand", "fr-secondstrand"),
                        help="library type of reads in bam file. ")

    parser.add_argument("--min-mapping-quality",
                        dest="minimum_mapping_quality", type=float,
                        help="minimum mapping quality. Reads with a "
                        "quality score of less will be ignored. ")

    parser.set_defaults(genome_file=None,
                        reporter="genes",
                        with_values=True,
                        sections=[],
                        counters=[],
                        filename_gff=[],
                        filename_format=None,
                        gff_features=[],
                        gff_sources=[],
                        add_gtf_source=False,
                        proximal_distance=10000,
                        bam_files=None,
                        multi_mapping='all',
                        library_type='fr-unstranded',
                        prefixes=[],
                        minimum_mapping_quality=0,
                        use_barcodes=False,
                        sample_probability=1.0)

    if not argv:
        argv = sys.argv

    (args) = E.start(parser, add_output_options=True, argv=argv)

    if args.prefixes:
        if len(args.prefixes) != len(args.counters):
            raise ValueError("if any prefix is given, the number of "
                             "prefixes must be the same as the number "
                             "of counters")

    # get files
    if args.genome_file:
        fasta = IndexedFasta.IndexedFasta(args.genome_file)
    else:
        fasta = None

    if args.quality_file:
        quality = IndexedFasta.IndexedFasta(args.quality_file)
        quality.setTranslator(IndexedFasta.TranslatorBytes())
    else:
        quality = None

    if args.bam_files:
        bam_files = []
        for bamfile in args.bam_files.split(","):
            bam_files.append(pysam.AlignmentFile(bamfile, "rb"))
    else:
        bam_files = None

    if args.bigwig_file:
        bigwig_file = pyBigWig.open(args.bigwig_file)
    else:
        bigwig_file = None

    counters = []

    if not args.sections:
        E.info("counters will use the default section (exons)")
        args.sections.append(None)

    if not args.gff_sources:
        args.gff_sources.append(None)

    if not args.gff_features:
        args.gff_features.append(None)

    cc = E.Counter()

    for n, c in enumerate(args.counters):
        if args.prefixes:
            prefix = args.prefixes[n]
        else:
            prefix = None

        if c == "position":
            for section in args.sections:
                counters.append(
                    GeneModelAnalysis.CounterPosition(
                        section=section, options=args, prefix=prefix))
        elif c == "length":
            for section in args.sections:
                counters.append(
                    GeneModelAnalysis.CounterLengths(
                        section=section, options=args, prefix=prefix))
        elif c == "splice":
            if fasta is None:
                raise ValueError('splice requires a genomic sequence')
            counters.append(
                GeneModelAnalysis.CounterSpliceSites(fasta=fasta,
                                                     prefix=prefix))
        elif c == "quality":
            if quality is None:
                raise ValueError(
                    'quality requires a quality score sequence')
            counters.append(
                GeneModelAnalysis.CounterQuality(fasta=quality,
                                                 prefix=prefix))
        elif c == "overrun":
            counters.append(
                GeneModelAnalysis.CounterOverrun(
                    filename_gff=args.filename_gff,
                    options=args,
                    prefix=prefix))
        elif c == "read-coverage":
            counters.append(
                GeneModelAnalysis.CounterReadCoverage(bam_files,
                                                      options=args,
                                                      prefix=prefix))
        elif c == "read-extension":
            counters.append(
                GeneModelAnalysis.CounterReadExtension(
                    bam_files,
                    filename_gff=args.filename_gff,
                    options=args,
                    prefix=prefix))
        elif c == "read-overlap":
            counters.append(
                GeneModelAnalysis.CounterReadOverlap(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args,
                    prefix=prefix))
        elif c == "read-counts":
            counters.append(
                GeneModelAnalysis.CounterReadCounts(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    use_barcodes=args.use_barcodes,
                    sample_probability=args.sample_probability,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args,
                    prefix=prefix))
        elif c == "read-fullcounts":
            counters.append(
                GeneModelAnalysis.CounterReadCountsFull(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    sample_probability=args.sample_probability,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args,
                    prefix=prefix))
        elif c == "readpair-counts":
            counters.append(
                GeneModelAnalysis.CounterReadPairCounts(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    sample_probability=args.sample_probability,
                    library_type=args.library_type,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args,
                    prefix=prefix))
        elif c == "readpair-fullcounts":
            counters.append(
                GeneModelAnalysis.CounterReadPairCountsFull(
                    bam_files,
                    multi_mapping=args.multi_mapping,
                    sample_probability=args.sample_probability,
                    minimum_mapping_quality=args.minimum_mapping_quality,
                    options=args,
                    prefix=prefix))
        elif c == "bigwig-counts":
            counters.append(
                GeneModelAnalysis.CounterBigwigCounts(bigwig_file,
                                                      options=args,
                                                      prefix=prefix))
        elif c == "splice-comparison":
            if fasta is None:
                raise ValueError('splice-comparison requires a genomic '
                                 'sequence')
            counters.append(
                GeneModelAnalysis.CounterSpliceSiteComparison(
                    fasta=fasta,
                    filename_gff=args.filename_gff,
                    feature=None,
                    source=None,
                    options=args,
                    prefix=prefix))
        elif c == "composition-na":
            if fasta is None:
                raise ValueError('composition-na requires a genomic '
                                 'sequence')
            for section in args.sections:
                counters.append(
                    GeneModelAnalysis.CounterCompositionNucleotides(
                        fasta=fasta,
                        section=section,
                        options=args,
                        prefix=prefix))
        elif c == "composition-cpg":
            if fasta is None:
                raise ValueError('composition-cpg requires a genomic '
                                 'sequence')
            for section in args.sections:
                counters.append(
                    GeneModelAnalysis.CounterCompositionCpG(
                        fasta=fasta,
                        section=section,
                        options=args,
                        prefix=prefix))

        elif c in ("overlap",
                   "overlap-stranded",
                   "overlap-transcripts",
                   "proximity",
                   "proximity-exclusive",
                   "proximity-lengthmatched",
                   "neighbours",
                   "territories",
                   "distance",
                   "distance-genes",
                   "distance-tss",
                   "binding-pattern",
                   "coverage"):
            if c == "overlap":
                template = GeneModelAnalysis.CounterOverlap
            elif c == "overlap-stranded":
                template = GeneModelAnalysis.CounterOverlapStranded
            elif c == "overlap-transcripts":
                template = GeneModelAnalysis.CounterOverlapTranscripts
            elif c == "proximity":
                template = GeneModelAnalysis.CounterProximity
            elif c == "neighbours":
                template = GeneModelAnalysis.CounterNeighbours
            elif c == "proximity-exclusive":
                template = GeneModelAnalysis.CounterProximityExclusive
            elif c == "proximity-lengthmatched":
                template = GeneModelAnalysis.CounterProximityLengthMatched
            elif c == "territories":
                template = GeneModelAnalysis.CounterTerritories
            elif c == "distance":
                template = GeneModelAnalysis.CounterDistance
            elif c == "distance-genes":
                template = GeneModelAnalysis.CounterDistanceGenes
            elif c == "distance-tss":
                template = \
                    GeneModelAnalysis.CounterDistanceTranscriptionStartSites
            elif c == "coverage":
                template = GeneModelAnalysis.CounterCoverage
            elif c == "binding-pattern":
                template = GeneModelAnalysis.CounterBindingPattern

            for section in args.sections:
                for source in args.gff_sources:
                    for feature in args.gff_features:
                        counters.append(
                            template(filename_gff=args.filename_gff,
                                     feature=feature,
                                     source=source,
                                     fasta=fasta,
                                     section=section,
                                     options=args,
                                     prefix=prefix))

        elif c == "classifier":
            counters.append(
                GeneModelAnalysis.Classifier(
                    filename_gff=args.filename_gff,
                    fasta=fasta,
                    options=args,
                    prefix=prefix))
        elif c == "classifier-rnaseq":
            counters.append(
                GeneModelAnalysis.ClassifierRNASeq(
                    filename_gff=args.filename_gff,
                    fasta=fasta,
                    options=args,
                    prefix=prefix))
        elif c == "classifier-rnaseq-splicing":
            counters.append(
                GeneModelAnalysis.ClassifierRNASeqSplicing(
                    filename_gff=args.filename_gff,
                    fasta=fasta,
                    options=args,
                    prefix=prefix))
        elif c == "classifier-polii":
            counters.append(
                GeneModelAnalysis.ClassifierPolII(
                    filename_gff=args.filename_gff,
                    feature=None,
                    source=None,
                    fasta=fasta,
                    options=args,
                    prefix=prefix))
        elif c == "binding-pattern":
            counters.append(
                GeneModelAnalysis.CounterBindingPattern(
                    filename_gff=args.filename_gff,
                    feature=None,
                    source=None,
                    fasta=fasta,
                    options=args,
                    prefix=prefix))

    if args.reporter == "genes":
        iterator = GTF.flat_gene_iterator
        header = ["gene_id"]

        def fheader(x):
            return [x[0].gene_id]

    elif args.reporter == "transcripts":
        iterator = GTF.transcript_iterator
        header = ["transcript_id"]

        def fheader(x):
            return [x[0].transcript_id]

    if args.add_gtf_source:
        header.append("source")

        def ffields(x):
            return [x[0].source]

    else:
        def ffields(x):
            return []

    args.stdout.write("\t".join(
        header + [x.getHeader() for x in counters]) + "\n")

    for gffs in iterator(GTF.iterator(args.stdin)):
        cc.input += 1

        for counter in counters:
            counter.update(gffs)

        skip = len([x for x in counters if x.skip]) == len(counters)
        if skip:
            cc.skipped += 1
            continue

        args.stdout.write("\t".join(
            fheader(gffs) +
            ffields(gffs) +
            [str(counter) for counter in counters]) + "\n")

        cc.output += 1

    E.info("%s" % str(cc))
    for counter in counters:
        E.info("%s\t%s" % (repr(counter), str(counter.counter)))
    E.stop()
def main(argv=sys.argv):

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("-t", "--no-titles",
                        dest="input_has_titles",
                        action="store_false",
                        help="no titles in input.")

    parser.add_argument("--ignore-titles",
                        dest="ignore_titles",
                        action="store_true",
                        help="ignore titles in input ")

    parser.add_argument("-i", "--skip-titles",
                        dest="skip_titles",
                        action="store_true",
                        help="skip output of titles.")

    parser.add_argument("-m", "--missing-value",
                        dest="missing_value",
                        type=str,
                        help="entry to use for missing values.")

    parser.add_argument("--header-names", dest="headers", type=str,
                        help="add headers for files as a ,-separated "
                        "list.")

    parser.add_argument("-c", "--columns", dest="columns", type=str,
                        help="columns to use for joining. Multiple "
                        "columns can be specified as a comma-separated "
                        "list ")

    parser.add_argument("-k", "--take", dest="take", type=str,
                        action="append",
                        help="columns to take. If not set, all columns "
                        "except for the join columns are taken ")

    parser.add_argument("-g", "--glob", dest="glob", type=str,
                        help="wildcard expression for table names.")

    parser.add_argument(
        "-s", "--sort-order", dest="sort", type=str,
        help="sort by column titles in particular given order: "
        "alphabetical|numeric|list of columns.")

    parser.add_argument("-e", "--merge-overlapping", dest="merge",
                        action="store_true",
                        help="simply merge tables without matching up "
                        "rows.")

    parser.add_argument("-a", "--cat", dest="cat", type=str,
                        help="simply concatenate tables. Adds an "
                        "additional column called X with the filename ")

    parser.add_argument("--sort-keys", dest="sort_keys", type=str,
                        choices=("numeric", "alphabetic"),
                        help="sort key columns by value.")

    parser.add_argument("--keep-empty", dest="ignore_empty",
                        action="store_false",
                        help="keep empty tables. The default is "
                        "to ignore them.")

    parser.add_argument("--ignore-empty", dest="ignore_empty",
                        action="store_true",
                        help="ignore empty tables - this is "
                        "the default.")

    parser.add_argument("--add-file-prefix", dest="add_file_prefix",
                        action="store_true",
                        help="add file prefix to column headers. "
                        "Suitable for multi-column tables")

    parser.add_argument("--use-file-prefix", dest="use_file_prefix",
                        action="store_true",
                        help="use file prefix as column headers. "
                        "Suitable for two-column tables ")

    parser.add_argument("--prefixes", dest="prefixes", type=str,
                        help="list of prefixes to use. "
                        ",-separated list of prefixes. "
                        "The number of prefixes need to correspond to "
                        "the number of input files")

    parser.add_argument("--regex-filename", dest="regex_filename",
                        type=str,
                        help="pattern to apply to filename to "
                        "build prefix")

    parser.add_argument("--regex-start", dest="regex_start", type=str,
                        help="regular expression to start "
                        "collecting table in a file")

    parser.add_argument("--regex-end", dest="regex_end", type=str,
                        help="regular expression to end collecting "
                        "table in a file")

    parser.add_argument(
        "--sep", dest="separator", type=str,
        help="table separator to use. The default is to use tabs. ")

    parser.add_argument("--test", dest="test", type=int,
                        help="test combining tables with "
                        "first X rows")

    parser.set_defaults(
        input_has_titles=True,
        skip_titles=False,
        missing_value=None,
        headers=None,
        sort=None,
        glob=None,
        columns="1",
        sort_keys=False,
        merge=False,
        ignore_empty=True,
        regex_start=None,
        regex_end=None,
        add_file_prefix=False,
        use_file_prefix=False,
        cat=None,
        take=[],
        regex_filename="(.*)",
        prefixes=None,
        test=0,
        separator="\t",
    )

    (args, unknown) = E.start(parser, argv=argv, unknowns=True)

    if args.headers:
        if "," in args.headers:
            args.headers = args.headers.split(",")
        else:
            args.headers = re.split(r"\s+", args.headers.strip())

    if args.sort and args.sort not in ("numeric", "alphabetic"):
        if "," in args.sort:
            args.sort = args.sort.split(",")
        else:
            args.sort = re.split(r"\s+", args.sort)

    if args.merge:
        args.columns = []
    else:
        args.columns = [int(x) - 1 for x in args.columns.split(",")]

    args.filenames = []
    if args.glob:
        args.filenames += glob.glob(args.glob)

    args.filenames += unknown

    if len(args.filenames) < 1:
        raise ValueError("no tables found.")

    E.info("combining %i tables" % len(args.filenames))

    if args.cat:
        table = concatenate_tables(args.filenames,
                                   regex_filename=args.regex_filename,
                                   separator=args.separator,
                                   headers=args.headers,
                                   missing_value=args.missing_value,
                                   cat=args.cat)
        table.to_csv(args.stdout, sep=args.separator, index=False)

    E.stop()
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv:
        argv = sys.argv

    # setup command line parser
    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.set_defaults()

    # add common options (-h/--help, ...) and parse command line
    (args, unknown) = E.start(parser, argv=argv, add_output_options=True,
                              unknowns=True)

    # derive output filenames from positional arguments, falling back
    # to the output filename pattern
    if len(unknown) == 1:
        fastqfile1 = unknown[0]
        fastqfile2 = args.output_filename_pattern % "2"
    elif len(unknown) == 2:
        fastqfile1, fastqfile2 = unknown
    else:
        fastqfile1 = args.output_filename_pattern % "1"
        fastqfile2 = args.output_filename_pattern % "2"

    # only output compressed data
    if not fastqfile1.endswith(".gz"):
        fastqfile1 += ".gz"
    if not fastqfile2.endswith(".gz"):
        fastqfile2 += ".gz"

    if args.stdin != sys.stdin:
        samfile = pysam.AlignmentFile(args.stdin.name, "rb")
    else:
        samfile = pysam.AlignmentFile("-", "rb")

    tmpdir = tempfile.mkdtemp()

    outtemp1 = os.path.join(tmpdir, "pair1.gz")
    outtemp2 = os.path.join(tmpdir, "pair2.gz")

    outstream1 = iotools.open_file(outtemp1, "w")
    outstream2 = iotools.open_file(outtemp2, "w")

    E.info('writing fastq files to temporary directory %s' % tmpdir)

    found1, found2 = set(), set()
    read1_qlen, read2_qlen = 0, 0

    c = E.Counter()
    for read in samfile.fetch(until_eof=True):
        c.input += 1
        if not read.is_paired:
            # mirror the read2 branch: write each read name only once,
            # even if it has multiple alignments
            if read.qname not in found1:
                outstream1.write(
                    "\t".join((read.qname, read.seq, read.qual)) + "\n")
                found1.add(read.qname)
                if not read1_qlen:
                    read1_qlen = read.qlen
                c.unpaired += 1
        elif read.is_read1:
            if read.qname not in found1:
                outstream1.write(
                    "\t".join((read.qname, read.seq, read.qual)) + "\n")
                found1.add(read.qname)
                if not read1_qlen:
                    read1_qlen = read.qlen
                c.output1 += 1
        elif read.is_read2:
            if read.qname not in found2:
                outstream2.write(
                    "\t".join((read.qname, read.seq, read.qual)) + "\n")
                found2.add(read.qname)
                if not read2_qlen:
                    read2_qlen = read.qlen
                c.output2 += 1

    if c.unpaired == 0 and c.output1 == 0 and c.output2 == 0:
        E.warn("no reads were found")
        return

    # sort the tab-separated intermediates by read name and reformat
    # them into fastq records
    sort_statement = '''gunzip < %s | sort -k1,1 | awk '{printf("@%%s\\n%%s\\n+\\n%%s\\n", $1,$2,$3)}' | gzip > %s'''

    if c.output1 == 0 and c.output2 == 0:
        # single end data:
        outstream1.close()
        outstream2.close()
        E.info("sorting fastq files")
        E.run(sort_statement % (outtemp1, fastqfile1))
    else:
        # paired end data: pad reads that are missing a mate with
        # dummy sequence and quality strings
        for qname in found2.difference(found1):
            outstream1.write("\t".join((qname,
                                        "N" * read1_qlen,
                                        "B" * read1_qlen)) + "\n")
            c.extra1 += 1

        for qname in found1.difference(found2):
            outstream2.write("\t".join((qname,
                                        "N" * read2_qlen,
                                        "B" * read2_qlen)) + "\n")
            c.extra2 += 1

        E.info("%s" % str(c))

        outstream1.close()
        outstream2.close()

        E.info("sorting fastq files")
        E.run(sort_statement % (outtemp1, fastqfile1))
        E.run(sort_statement % (outtemp2, fastqfile2))

    shutil.rmtree(tmpdir)

    # write footer and output benchmark information.
    E.stop()
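
# Usage sketch (assumed script name; --output-filename-pattern is taken to be
# the flag behind output_filename_pattern, added by add_output_options):
#
#   cat aligned.bam
#   | python bam2fastq.py --output-filename-pattern=reads.%s.fastq
#
# For paired data this would yield reads.1.fastq.gz and reads.2.fastq.gz;
# reads missing a mate are padded with N bases and dummy qualities.
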
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if argv is None:
        argv = sys.argv

    parser = E.ArgumentParser(description=__doc__)

    parser.add_argument("--version", action='version', version="1.0")

    parser.add_argument("-u", "--unique", dest="unique", action="store_true",
                        help="output rows are unique.")

    parser.set_defaults(
        remove=False,
        unique=False,
    )

    (args, unknown) = E.start(parser, argv=argv, add_csv_options=True,
                              unknowns=True)

    if len(unknown) != 2:
        raise ValueError("please specify two files to join")

    args.filename1, args.filename2 = unknown

    # readTable is assumed to return (column names, rows) for each file
    fields1, rows1 = readTable(iotools.open_file(args.filename1, "r"))
    fields2, rows2 = readTable(iotools.open_file(args.filename2, "r"))

    if args.unique:
        outfile = UniqueBuffer(sys.stdout)
    else:
        outfile = args.stdout

    # join on the columns shared by both tables (assumption: the original
    # referenced undefined join_fields1/join_fields2 options here)
    join_fields = [x for x in fields1 if x in fields2]
    if not join_fields:
        raise ValueError("the two tables share no columns to join on")

    # build new field list: join columns first, then the remaining
    # columns of each table
    new_fields = list(join_fields)
    for x in fields1:
        if x not in join_fields:
            new_fields.append(x)
    for x in fields2:
        if x not in join_fields:
            new_fields.append(x)

    writer = csv.DictWriter(outfile,
                            new_fields,
                            dialect=args.csv_dialect,
                            lineterminator=args.csv_lineterminator,
                            extrasaction='ignore')
    writer.writeheader()

    # index the rows of the second table by their join-field values
    index2 = {}
    for row in rows2:
        d = dict(zip(fields2, row))
        index2[tuple(d[x] for x in join_fields)] = d

    # output rows of the first table that have a match in the second
    for row in rows1:
        d = dict(zip(fields1, row))
        match = index2.get(tuple(d[x] for x in join_fields))
        if match is None:
            continue
        d.update(match)
        writer.writerow(iotools.convertDictionary(d))

    E.stop()
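
# Usage sketch (assumed script name and inputs; both files need a header row
# and at least one column in common to join on):
#
#   python csv_intersection.py --unique a.tsv b.tsv > joined.tsv
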