def main(): def add_args(parser): parser.add_argument("-c", "--chromosomes", action="extend_overwrite", type="str_list", default=[str(c) for c in xrange(1, 20)] + ["X", "M"]) parser.add_argument("-m", "--mt_window_size", type=int, default=100) parser.add_argument("-w", "--window_size", type=int, default=1000000) parser.add_argument( "--start", type=int, default=3000000, help="Start of first window for autosomal and X chromosomes.") parser.add_argument("genome_file", type="readable_file") parser.add_argument("output_file", type="writeable_file") args = parse(add_args) genome = csv_to_dict(args.genome_file, delim="\t") with open(args.output_file, "w") as o: for chrm in args.chromosomes: chrm_name = "chr{0}".format(chrm) chrm_size = int(genome[chrm_name][1]) window_size = args.mt_window_size if chrm == "M" else args.window_size for pos in xrange(args.start, chrm_size, window_size): if pos < chrm_size: o.write("{0}\t{1}\t{2}\n".format( chrm_name, pos, min(pos + window_size, chrm_size)))
def main():
    def parse_args(parser):
        parser.add_argument("-c", "--cleanup", action="store_true", default=False)
        parser.add_argument("-w", "--working_dir", type="writeable_dir", default=None)
        parser.add_argument("--pandoc_exe", default="pandoc")
        parser.add_argument("--pdflatex_exe", default="pdflatex")
        parser.add_argument("--open_pdf", action="store_true", default=False)
        parser.add_argument("dir", type="readable_dir")
        parser.add_argument("outfile", type="writeable_file")

    args = parse(parse_args)

    latex_dir = os.path.join(args.dir, "latex")
    assert os.path.exists(latex_dir), "Missing directory: {0}".format(latex_dir)
    section_dir = os.path.join(args.dir, "sections")
    assert os.path.exists(section_dir), "Missing directory: {0}".format(section_dir)
    bib_dir = os.path.join(args.dir, "bib")
    assert os.path.exists(bib_dir), "Missing directory: {0}".format(bib_dir)

    pdf = compile_pdf(latex_dir, section_dir, bib_dir, args.outfile, args.working_dir,
        args.cleanup, args.pandoc_exe, args.pdflatex_exe)

    if args.open_pdf:
        bash("open {0} &".format(pdf), catch=False)
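# The three asserts above imply an input layout like the following (the
# project name is hypothetical; the sections are presumably pandoc input that
# compile_pdf converts before running pdflatex):
#
#   thesis/
#     latex/       required: LaTeX templates/preamble
#     sections/    required: document sections
#     bib/         required: bibliography files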
def main(argv=None):
    def add_args(parser):
        parser.add_argument('-a', '--append', action='store_true', default=False)
        parser.add_argument('-h', '--header', default=0,
            help="Either an integer that is the number of header lines in the input file "
                 "(use negative numbers to suppress copying of headers to output files) or "
                 "a string that should be used as the header.")
        parser.add_argument('-P', '--pattern_file', type='readable_file', metavar="FILE")
        parser.add_argument('-p', '--pattern', type='mapping', action='append',
            metavar="PATTERN=OUTPUT")
        parser.add_argument('-u', '--unmatched_file', type='writeable_file', metavar="FILE",
            help="File in which to write lines that do not match any pattern.")
        parser.add_argument('file', type='readable_file', metavar="FILE")

    ns = parse(add_args, args=argv)

    header = None
    if isinstance(ns.header, str):
        if ns.header.isdigit():
            header = int(ns.header)
        else:
            header = [ns.header]

    if ns.pattern_file:
        mod = load_module_from_file(ns.pattern_file)
        if not hasattr(mod, 'handlers'):
            raise Exception("Invalid pattern file: %s" % ns.pattern_file)
        handlers = mod.handlers
    else:
        # use a list rather than a generator so the emptiness check below
        # actually works (a generator is always truthy), and tolerate no -p
        # options having been given
        handlers = [DefaultFileHandler(m[0], m[1], ns.append) for m in (ns.pattern or [])]

    if handlers:
        unmatched = None
        if ns.unmatched_file:
            unmatched = open(ns.unmatched_file, 'w')
        try:
            split(ns.file, handlers, header, unmatched)
        finally:
            if unmatched:
                unmatched.close()
    else:
        print "No handlers specified; doing nothing."
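# A pattern file (-P) is just a Python module defining a top-level 'handlers'
# iterable; that attribute is the only thing this script checks for. A minimal
# hypothetical example, assuming DefaultFileHandler takes (pattern, output,
# append) as its use above suggests:
#
#   # patterns.py
#   handlers = [
#       DefaultFileHandler(r"^chr1\b", "chr1.txt", False),
#       DefaultFileHandler(r"^chr2\b", "chr2.txt", False),
#   ]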
def main(argv=None): def add_opts(parser): parser.add_argument("-d", "--delim", default=",") parser.add_argument("-r", "--replace", default=":-") parser.add_argument("infile", type='readable_file') parser.add_argument("outfile", type='writeable_file') ns = parse(add_opts, args=argv) with open(ns.infile, 'rU') as i, open(ns.outfile, 'w') as o: convert(i, o, ns.delim, ns.replace)
def main(): def add_opts(parser): parser.add_argument("fasta", type="readable_file") parser.add_argument("seqs", type="readable_file") parser.add_argument("outdir", type="writeable_dir") args = parse(add_opts) with open(args.fasta, "rU") as f: for record in SeqIO.parse(f, "fasta"): outfile = os.path.join(args.outdir, "{0}.txt".format(record.id)) cmd = 'grep "{0}" "{1}" > "{2}"'.format(record.seq, args.seqs, outfile) bash(cmd)
def main(argv=None): def add_opts(parser): parser.add_argument("-d", "--decimal", action="store_true", default=False) parser.add_argument("-w", "--window_size", default=100, type=int) parser.add_argument("infile", type='readable_file') parser.add_argument("outfile", type='writeable_file') ns = parse(add_opts, args=argv) with open(ns.infile, 'rU') as inp: seq = inp.read().replace("\n","") cg_pct(seq, ns.outfile, ns.window_size, ns.decimal)
def main():
    def add_args(parser):
        parser.add_argument('infile', type='readable_file', metavar='FILE')
        parser.add_argument('outdir', type='writeable_dir', default='.', nargs='?')

    ns = parse(add_args)
    outfile = None
    for line in fileinput.input(ns.infile):
        if line[0] == ">":
            # start a new output file at each FASTA header; line[1:-1] strips
            # the ">" and the trailing newline to get the record name
            if outfile is not None:
                outfile.close()
            fname = os.path.join(ns.outdir, "{0}.fasta".format(line[1:-1]))
            outfile = open(fname, 'w')
        outfile.write(line)
    if outfile is not None:
        outfile.close()
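# Example: an input FASTA containing records ">seq1" and ">seq2" is split into
# outdir/seq1.fasta and outdir/seq2.fasta, each holding its header line plus
# all sequence lines up to the next header.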
def main(argv=None): def add_opts(parser): parser.add_argument("-d", "--decimal", action="store_true", default=False) parser.add_argument("-w", "--window_size", default=100, type=int) parser.add_argument("infile", type='readable_file') parser.add_argument("outfile", type='writeable_file') ns = parse(add_opts, args=argv) with open(ns.infile, 'rU') as inp: seq = inp.read().replace("\n", "") cg_pct(seq, ns.outfile, ns.window_size, ns.decimal)
def main():
    def add_arguments(parser):
        parser.add_argument('-d', '--delim', default="\t")
        parser.add_argument('infile', type='readable_file')
        parser.add_argument('termfile', type='readable_file')
        parser.add_argument('outfile', type='writeable_file', nargs='?', default=None)

    ns = parse(add_arguments)
    with open(ns.infile, 'rU') as f:
        s = f.read()
    for find, repl in csv_to_table(ns.termfile, ns.delim):
        s = re.sub(find, repl, s)
    # if no output file is given, overwrite the input file in place
    outfile = ns.outfile or ns.infile
    write_file(outfile, s)
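# The term file is parsed by csv_to_table() into (find, replace) rows, and
# each 'find' value is passed to re.sub, so it is treated as a regular
# expression. A hypothetical term file using the default tab delimiter:
#
#   colour<TAB>color
#   \bgrey\b<TAB>gray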
metavar="COMMAND", default=None, help="Command to run on piped process output.") output_group = parser.add_mutually_exclusive_group() output_group.add_argument("-r", "--result_file", type="writeable_file", metavar="FILE", help="File where all results are written in CSV format (one column for each variable "\ "value followed by a column with the result). All fields are quoted.") output_group.add_argument( "-R", "--result_file_pattern", metavar="PATTERN", help= "Pattern from which output file is created by interpolation with variable values." ) ns = parse(add_arguments, args=sys.argv[1:i]) prog = sys.argv[i] args = sys.argv[i + 1:] argvars = VarArgGenerator() if ns.var_file: config = SafeConfigParser() config.read(ns.var_file) if config.has_section('constants'): argvars.update(config.items('constants')) if config.has_section['variables']: argvars.update( dict(parse_vars(m) for m in config.items('variables')))
def main(): def add_args(parser): parser.add_argument("-k", "--kmer_sizes", type="int_list", action="extend_overwrite", default=(1, )) parser.add_argument("-r", "--read_length", type=int, default=100) parser.add_argument("--bigint", action="store_true", default=False) parser.add_argument("--log", type="writeable_file", default=None) parser.add_argument("--log_interval", type=int, default=100000) parser.add_argument("--prefix", default=None) parser.add_argument("--summary_only", action="store_true", default=False) parser.add_argument("input_file", type="readable_file") parser.add_argument("output_dir", type="writeable_dir", nargs="?", default=".") args = parse(add_args) dtype = np.int64 if args.bigint else np.int32 total_mem = sum( SeqDiv.estimate_memory(args.read_length, k, dtype) for k in args.kmer_sizes) / 1000000.0 sys.stderr.write( "This program will use up to {0} MiB of memory\n".format(total_mem)) kmers = dict((k, SeqDiv(k, dtype)) for k in args.kmer_sizes) log = None start = None if args.log is not None: from datetime import datetime start = datetime.now() log = open(args.log, "w", 0) log.write("Starting at {0}\n".format(start)) log.flush() for read_num, read in enumerate( fileinput.input(args.input_file, mode="rU"), 1): # TODO: this could be threaded if it's too slow read = read.strip() read_len = len(read) if read_len != args.read_length: sys.exit( "Invalid read length at read {0}: expected {1}, actual {2}". format(read_num, args.read_length, read_len)) for sd in kmers.values(): sd.insert_nocheck(read, read_len) if log is not None and read_num % args.log_interval == 0: now = datetime.now() log.write("Processed {0} reads in {1} hours\n".format( read_num, round((now - start).total_seconds() / 3600, 3))) log.flush() prefix = args.prefix if prefix is None: prefix = os.path.splitext(os.path.basename(args.input_file))[0] for k in kmers.keys(): summary_file = os.path.join( args.output_dir, "{0}_summary_{1}mers.csv".format(prefix, k)) with open(summary_file, "w") as o: kmers[k].write_summary(o) if not args.summary_only: count_file = os.path.join( args.output_dir, "{0}_counts_{1}mers.csv".format(prefix, k)) with open(count_file, "w") as o: kmers[k].write_counts(o) if log is not None: log.close()
def main(): def add_args(parser): parser.add_argument("-b", "--bfile", type=readable_file_group(("bed", "bim", "fam")), default=None, help="Prefix of plink bfiles (bed, bim and fam).") parser.add_argument("-c", "--chromosomes", type=delimited_macro("chrm"), default="F", help="Set of chromosomes on which to execute the command(s)") parser.add_argument("-f", "--fork_mode", choices=("test","serial","thread","lsf"), default="thread", help="How to distribute jobs.") parser.add_argument("-i", "--interval_size", type=int, default=1000, metavar="bp", help="Bin size for interval statistics (in bp)") parser.add_argument("-k", "--keep_file", type="readable_file", metavar="FILE", default=None, help="Plink keep file (list of samples to include).") parser.add_argument("-s", "--plot_stats", action="extend_overwrite", type="str_list", metavar="LIST", default=("mean","max","pct"), help="List of stats for which to create (mean, max, pct)") parser.add_argument("-S", "--summary_stat", action="extend_overwrite", choices=("all", "unlinked"), default="unlinked", help="How to compute bin summary stats (use all markers or only unlinked markers)") parser.add_argument("-p", "--percentile", type=int, default=95, metavar="PCT", help="Percentile for r-squared statistics.") parser.add_argument("-r", "--r2_bin_size", type=float, default=0.01, help="Bin size for r-squared histogram.") parser.add_argument("-w", "--window_size", type=int, default=500, metavar="Kb", help="Window size (in kb)") parser.add_argument("--fork_opts", action="extend_dict", type="mapping_list", default={}, help="Options specific to the fork mode.") parser.add_argument("--unfiltered", action="store_true", default=False, help="Assume data files are unfiltered and apply uncalled and maf filters.") parser.add_argument("--no_unlinked", action="store_true", default=False, help="Don't compute stats for unlinked markers.") parser.add_argument("--per_chromosome", action="store_true", default=False, help="Whether the data file has been split into one per chromosome.") parser.add_argument("--block_r2_thresholds", type=float, nargs=2, default=(0.2, 0.95), help="Lower and upper r-square thresholds for identifying haplotype blocks with 'make_blocks' command.") parser.add_argument("--output_format", choices=("ped","bed","tped"), default="ped", help="Output format for commands that produce PLINK data files.") parser.add_argument("--new_plink", default="/Users/johndidion/software/plink_mac/plink") parser.add_argument("--old_plink", default="plink") parser.add_argument("outdir", type="writeable_dir") parser.add_argument("commands", action="extend", nargs="+", choices=COMMANDS, help="Commands to run. 
If none are specified, all will be run.") ns = parse(add_args) geno.plink.PLINK_CMD = ns.new_plink geno.plink.PLINK_OLD = ns.old_plink if ns.bfile: bedfile, bimfile, famfile = ns.bfile bfile = os.path.splitext(bedfile)[0] fname = os.path.basename(bfile) window_size = ns.window_size * 1000 commands = ns.commands if ns.commands else COMMANDS mkdir(ns.outdir, overwrite=False) if any(c in FORK_COMMANDS for c in commands): executor = get_executor(ns.fork_mode, ns.fork_opts) # partition the genome into windows and generate a list of SNPs in each window if "snps" in commands: # this command is not forked but is not time-consuming chr_file, win_file = partition_snps(bimfile, ns.outdir, window_size) else: win_file = os.path.join(ns.outdir, "windows.csv") # execute the plink --r2 command over snp windows (requires 'snps' command) if "plink" in commands: cmd_iter = pairwise_ld_command_iter(bfile, ns.outdir, win_file, apply_filters=ns.unfiltered) exec_shell(cmd_iter, executor, error_handler=reraise_error) # process the results of the 'plink' command if "process" in commands: # this command is not forked. it should be submitted to lsf if run on kure. process_ld_files(ns.outdir, ns.percentile, ns.interval_size, ns.r2_bin_size, unlinked=not ns.no_unlinked) # generate heatmap plots from the results of the 'process' command if "plot" in commands: bins = os.path.join(ns.outdir, "window_summary.csv") for s in ns.plot_stats: summary_stat = "{0}_{1}".format(ns.summary_stat, s) plot_ld_heatmap( os.path.join(ns.outdir, "{0}_r2_matrix.csv".format(s)), bins, os.path.join(ns.outdir, "{0}_heatmap.pdf".format(s)), window_size=window_size, summary_stat=summary_stat, tics=True) if "local" in commands: # TODO: execute commands for local LD pass if "process_local" in commands: process_local_ld_file(os.path.join(ns.outdir, "local.ld"), os.path.join(ns.outdir, "r2_hist.csv")) per_chrm = ns.per_chromosome # split a whole-genome data file into one file per chromosome if "split" in commands: cmd_iter = split(bfile, ns.outdir, chromosomes=ns.chromosomes, output_format=ns.output_format) exec_shell(cmd_iter, executor, error_handler=reraise_error) per_chrm = True # execute the plink --blocks command if "blocks" in commands: # this command is only forked if run on split files (1 per chr). it should be submitted to lsf if run on kure. if per_chrm: cmd_iter = per_chrm_iter(bfile, ns.outdir, format_ld_blocks_command, ns.chromosomes, apply_filters=ns.unfiltered) exec_shell(cmd_iter, executor, error_handler=reraise_error) else: cmd = format_ld_blocks_command(bfile, ns.outdir, apply_filters=ns.unfiltered) bash(cmd) if "block_ld" in commands: if per_chrm: path = os.path.join(ns.outdir, "**", "*.blocks.det") else: path = os.path.join(ns.outdir, "*.blocks.det") lines = fileinput.input(glob.glob(path)) cmd_iter = block_ld(lines, bfile, ns.outdir) exec_shell(cmd_iter, executor, error_handler=reraise_error) if "prune_blocks" in commands: prune_blocks(ns.outdir, ns.chromosomes) if "make_blocks" in commands: def arg_iter(): for chrm in ns.chromosomes: yield (chrm, os.path.join(ns.outdir, "chr{0}".format(chrm), fname)) distribute(call_make_blocks, arg_iter(), mode=ns.fork_mode, **ns.fork_opts) if "tag_snps" in commands: bash(format_merge_tag_lists_command(ns.outdir, fname, ns.chromosomes))
def main(): def add_args(parser): parser.add_argument("-b", "--bfile", type=readable_file_group(("bed", "bim", "fam")), default=None, help="Prefix of plink bfiles (bed, bim and fam).") parser.add_argument( "-c", "--chromosomes", type=delimited_macro("chrm"), default="F", help="Set of chromosomes on which to execute the command(s)") parser.add_argument("-f", "--fork_mode", choices=("test", "serial", "thread", "lsf"), default="thread", help="How to distribute jobs.") parser.add_argument("-i", "--interval_size", type=int, default=1000, metavar="bp", help="Bin size for interval statistics (in bp)") parser.add_argument( "-k", "--keep_file", type="readable_file", metavar="FILE", default=None, help="Plink keep file (list of samples to include).") parser.add_argument( "-s", "--plot_stats", action="extend_overwrite", type="str_list", metavar="LIST", default=("mean", "max", "pct"), help="List of stats for which to create (mean, max, pct)") parser.add_argument( "-S", "--summary_stat", action="extend_overwrite", choices=("all", "unlinked"), default="unlinked", help= "How to compute bin summary stats (use all markers or only unlinked markers)" ) parser.add_argument("-p", "--percentile", type=int, default=95, metavar="PCT", help="Percentile for r-squared statistics.") parser.add_argument("-r", "--r2_bin_size", type=float, default=0.01, help="Bin size for r-squared histogram.") parser.add_argument("-w", "--window_size", type=int, default=500, metavar="Kb", help="Window size (in kb)") parser.add_argument("--fork_opts", action="extend_dict", type="mapping_list", default={}, help="Options specific to the fork mode.") parser.add_argument( "--unfiltered", action="store_true", default=False, help= "Assume data files are unfiltered and apply uncalled and maf filters." ) parser.add_argument("--no_unlinked", action="store_true", default=False, help="Don't compute stats for unlinked markers.") parser.add_argument( "--per_chromosome", action="store_true", default=False, help="Whether the data file has been split into one per chromosome." ) parser.add_argument( "--block_r2_thresholds", type=float, nargs=2, default=(0.2, 0.95), help= "Lower and upper r-square thresholds for identifying haplotype blocks with 'make_blocks' command." ) parser.add_argument( "--output_format", choices=("ped", "bed", "tped"), default="ped", help="Output format for commands that produce PLINK data files.") parser.add_argument( "--new_plink", default="/Users/johndidion/software/plink_mac/plink") parser.add_argument("--old_plink", default="plink") parser.add_argument("outdir", type="writeable_dir") parser.add_argument( "commands", action="extend", nargs="+", choices=COMMANDS, help="Commands to run. 
If none are specified, all will be run.") ns = parse(add_args) geno.plink.PLINK_CMD = ns.new_plink geno.plink.PLINK_OLD = ns.old_plink if ns.bfile: bedfile, bimfile, famfile = ns.bfile bfile = os.path.splitext(bedfile)[0] fname = os.path.basename(bfile) window_size = ns.window_size * 1000 commands = ns.commands if ns.commands else COMMANDS mkdir(ns.outdir, overwrite=False) if any(c in FORK_COMMANDS for c in commands): executor = get_executor(ns.fork_mode, ns.fork_opts) # partition the genome into windows and generate a list of SNPs in each window if "snps" in commands: # this command is not forked but is not time-consuming chr_file, win_file = partition_snps(bimfile, ns.outdir, window_size) else: win_file = os.path.join(ns.outdir, "windows.csv") # execute the plink --r2 command over snp windows (requires 'snps' command) if "plink" in commands: cmd_iter = pairwise_ld_command_iter(bfile, ns.outdir, win_file, apply_filters=ns.unfiltered) exec_shell(cmd_iter, executor, error_handler=reraise_error) # process the results of the 'plink' command if "process" in commands: # this command is not forked. it should be submitted to lsf if run on kure. process_ld_files(ns.outdir, ns.percentile, ns.interval_size, ns.r2_bin_size, unlinked=not ns.no_unlinked) # generate heatmap plots from the results of the 'process' command if "plot" in commands: bins = os.path.join(ns.outdir, "window_summary.csv") for s in ns.plot_stats: summary_stat = "{0}_{1}".format(ns.summary_stat, s) plot_ld_heatmap(os.path.join(ns.outdir, "{0}_r2_matrix.csv".format(s)), bins, os.path.join(ns.outdir, "{0}_heatmap.pdf".format(s)), window_size=window_size, summary_stat=summary_stat, tics=True) if "local" in commands: # TODO: execute commands for local LD pass if "process_local" in commands: process_local_ld_file(os.path.join(ns.outdir, "local.ld"), os.path.join(ns.outdir, "r2_hist.csv")) per_chrm = ns.per_chromosome # split a whole-genome data file into one file per chromosome if "split" in commands: cmd_iter = split(bfile, ns.outdir, chromosomes=ns.chromosomes, output_format=ns.output_format) exec_shell(cmd_iter, executor, error_handler=reraise_error) per_chrm = True # execute the plink --blocks command if "blocks" in commands: # this command is only forked if run on split files (1 per chr). it should be submitted to lsf if run on kure. if per_chrm: cmd_iter = per_chrm_iter(bfile, ns.outdir, format_ld_blocks_command, ns.chromosomes, apply_filters=ns.unfiltered) exec_shell(cmd_iter, executor, error_handler=reraise_error) else: cmd = format_ld_blocks_command(bfile, ns.outdir, apply_filters=ns.unfiltered) bash(cmd) if "block_ld" in commands: if per_chrm: path = os.path.join(ns.outdir, "**", "*.blocks.det") else: path = os.path.join(ns.outdir, "*.blocks.det") lines = fileinput.input(glob.glob(path)) cmd_iter = block_ld(lines, bfile, ns.outdir) exec_shell(cmd_iter, executor, error_handler=reraise_error) if "prune_blocks" in commands: prune_blocks(ns.outdir, ns.chromosomes) if "make_blocks" in commands: def arg_iter(): for chrm in ns.chromosomes: yield (chrm, os.path.join(ns.outdir, "chr{0}".format(chrm), fname)) distribute(call_make_blocks, arg_iter(), mode=ns.fork_mode, **ns.fork_opts) if "tag_snps" in commands: bash(format_merge_tag_lists_command(ns.outdir, fname, ns.chromosomes))
def main(argv=None):
    def add_opts(parser):
        parser.add_argument('-H', '--homozygosity_cutoff', type=int, default=97,
            help="Percent homozygosity required to declare a region homozygous.")
        parser.add_argument('-k', '--karyotype_file', type='readable_file', default=None,
            help="File containing the karyotype.")
        parser.add_argument('-m', '--smoothing_size', type=int, default=20,
            help="Window size to use when smoothing regions.")
        parser.add_argument('-s', '--sample_file', type='readable_file', default=None,
            help="File containing sample information.")
        parser.add_argument('-w', '--window_size', type=int, default=300,
            help="Number of markers in sliding classification window.")
        parser.add_argument('-W', '--window_slide', type=int, default=1,
            help="Number of markers to slide the window.")
        parser.add_argument('-x', '--exclude_file', type='readable_file', default=None,
            help="File containing regions to ignore.")
        parser.add_argument('--genotype_format', choices=['num', 'bin'], default='num',
            help="Format of genotypes: num = -1/1/2/3/4, bin = -1=N, 0=hom, 1=het.")
        parser.add_argument('--max_smoothing_iterations', type=int, default=100,
            help="Maximum iterations to spend smoothing a sequence.")
        parser.add_argument('genotype_file', type='readable_file',
            help="File containing genotypes for samples, one sample per row.")
        parser.add_argument('output_dir', type='writeable_dir',
            help="Directory to write results, one file per chromosome.")

    ns = parse(add_opts, args=argv)
    if log.is_debug():
        log.debug("find_intervals.py called with args: {0}".format(ns))

    samples = read_samples(ns.sample_file)
    if log.is_debug():
        log.debug("{0} samples".format(len(samples)))

    exclude = read_exclude(ns.exclude_file)
    if log.is_debug():
        if exclude:
            log.debug("Exclude regions on chromosomes {0}".format(exclude.keys()))
        else:
            log.debug("No exclude regions")

    chrom_sizes = dict((c.name, c.size) for c in karyotype(ns.karyotype_file))

    chromosomes = []
    ordered_samples = []
    genotypes = {}
    with open(ns.genotype_file, 'rU') as f:
        r = reader(f)

        # create list of tuples: SNPID, chromosome, position
        head_iter = zipiter(r.next() for i in xrange(0, 3))
        head_iter.next()  # remove header column
        snps = map(lambda x: SNP(*x), head_iter)
        nsnp = len(snps)
        if log.is_debug():
            log.debug("{0} snps".format(nsnp))

        chrom = None
        start = None
        for i in xrange(0, nsnp):
            s = snps[i]
            if s.chromosome != chrom:
                # test against None, not truthiness: the first chromosome
                # starts at index 0, which is falsy
                if start is not None:
                    chromosomes.append(Chromosome(chrom, chrom_sizes[chrom], start, i))
                chrom = s.chromosome
                start = i
        chromosomes.append(Chromosome(chrom, chrom_sizes[chrom], start, nsnp))
        if log.is_debug():
            log.debug("{0} chromosomes".format(len(chromosomes)))

        for sample in r:
            name = sample.pop(0)  # pop off header column
            ordered_samples.append(samples.get(name, name))
            for chrom in chromosomes:
                genotypes.setdefault(chrom.name, []).append(
                    Genotypes(chrom.slice(sample), ns.genotype_format))

    min_hom = float(ns.homozygosity_cutoff) / 100
    smoother = Smoother(ns.smoothing_size, ns.max_smoothing_iterations)

    if not os.path.exists(ns.output_dir):
        os.makedirs(ns.output_dir)

    for chrom in chromosomes:
        log.info("Processing chromosome {0}".format(chrom.name))

        # break genotypes into regions, scan each region using a sliding window
        # and classify each window as homozygous or heterozygous
        regions = classify_windows(View(snps, chrom), genotypes[chrom.name],
            wrapiter(exclude.get(chrom.name, None)), ns.window_size, ns.window_slide,
            min_hom, ns.smoothing_size)

        int_file = os.path.join(ns.output_dir, "intervals_chr%s.csv" % chrom.name)
        min_file = os.path.join(ns.output_dir, "minimal_chr%s.csv" % chrom.name)

        # smooth out each region and partition into hom/het intervals
        with open(int_file, 'w') as iout, open(min_file, 'w') as mout:
            int_writer = writer(iout)
            int_writer.writerow(('Sample', 'Start', 'End', 'Markers', 'Call'))
            min_writer = writer(mout)
            min_writer.writerow(['Start', 'End', 'Markers'] + ordered_samples)
            for region in regions:
                partition(region, ordered_samples, smoother, int_writer, min_writer)
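# The parsing code above implies a genotype file layout in which the first
# three rows give SNP IDs, chromosomes, and positions (the first column of
# each is a row label and is discarded), followed by one row per sample whose
# first column is the sample name, e.g.:
#
#   id,SNP1,SNP2,SNP3
#   chr,1,1,2
#   pos,3000100,3001200,4000050
#   sampleA,1,2,-1
#   sampleB,3,3,1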
help="Define a variable whose values come frome FILE") parser.add_argument("-w", "--sliding_window", metavar="NAME=LOW,HIGH,STEP", action="dict", type="delimited_mapping", help="Define a sliding window variable.") pipe_group = parser.add_argument_group("pipes", pipe_help) pipe_group.add_argument("--pipe_file_pattern", metavar="PATTERN", default=None, help="Process output file to send to pipe_command.") pipe_group.add_argument("--pipe_command_pattern", metavar="COMMAND", default=None, help="Command to run on piped process output.") output_group = parser.add_mutually_exclusive_group() output_group.add_argument("-r", "--result_file", type="writeable_file", metavar="FILE", help="File where all results are written in CSV format (one column for each variable "\ "value followed by a column with the result). All fields are quoted.") output_group.add_argument("-R", "--result_file_pattern", metavar="PATTERN", help="Pattern from which output file is created by interpolation with variable values.") ns = parse(add_arguments, args=sys.argv[1:i]) prog = sys.argv[i] args = sys.argv[i+1:] argvars = VarArgGenerator() if ns.var_file: config = SafeConfigParser() config.read(ns.var_file) if config.has_section('constants'): argvars.update(config.items('constants')) if config.has_section['variables']: argvars.update(dict(parse_vars(m) for m in config.items('variables')))