def main(arguments=None): """Main method""" arguments = sys.argv[1:] if arguments is None else arguments parser = generate_argparser() args = parser.parse_args(args=arguments) # HELP MENU if args.morehelp: modulehelp(MODULENAMES) sys.exit() # ESTABLISH MVF mvf = MultiVariantFile(args.mvf, 'read') # Argument Pre-processing if args.allele_groups: groups = {} for elem in args.allele_groups: elem = elem.split(':') groups[elem[0]] = mvf.get_sample_indices(labels=elem[1].split(',')) args.allele_groups = groups.copy() for grp0, grp1 in combinations(groups, 2): if set(groups[grp0]) & set(groups[grp1]): raise RuntimeError("Groups contain same element", set(groups[grp0]) & set(groups[grp1])) if args.speciesgroups: groups = {} for elem in args.speciesgroups: elem = elem.split(':') groups[elem[0]] = mvf.get_sample_indices(labels=elem[1].split(',')) args.speciesgroups = groups.copy() for specgroup in groups: ngroup = 0 for allelegroup in args.allele_groups.values(): if set(allelegroup) & set(groups[specgroup]): ngroup += 1 if ngroup > 1: raise RuntimeError(specgroup, "split across 2+ groups") # MODULES if args.module == 'Coverage': module = Coverage(params=vars(args)) elif args.module == 'GroupUniqueAlleleWindow': module = GroupUniqueAlleleWindow(params=vars(args)) elif args.module == 'PiDiversityWindow': module = PiDiversityWindow(params=vars(args)) elif args.module == 'PairwiseNS': module = PairwiseNS(params=vars(args)) # RUN MODULE module.analyze(mvf) return ''
def main(arguments=None): """Main method""" arguments = sys.argv[1:] if arguments is None else arguments parser = generate_argparser() args = parser.parse_args(args=arguments) mvf = MultiVariantFile(args.mvf, 'read') flavor = mvf.metadata['flavor'] if (flavor in ("dna", "rna") and args.outdata == "prot") or ( flavor == "prot" and args.outdata in ("dna", "rna")): raise RuntimeError( "--outdata {} incompatiable with '{}' flavor mvf".format( args.outdata, flavor)) sample_cols = mvf.get_sample_indices(args.samples or None) labels = mvf.get_sample_labels(sample_cols) current_contig = '' seqs = {} for contig, _, allelesets in mvf.iterentries(quiet=args.quiet, decode=True): if contig != current_contig: if seqs: with open( "{}.{}.fa".format( args.outprefix, mvf.metadata['contigs'][contig]['label']), 'wt') as outfile: for seqname in sorted(seqs): outfile.write(">{}\n{}\n".format( seqname, ''.join(seqs[seqname]))) seqs = None seqs = {} current_contig = contig[:] for col, label in zip(sample_cols, labels): if label not in seqs: seqs[label] = [] if flavor in ('dna', 'rna'): seqs[label].append(allelesets[0][col] == 'X' and 'N' or allelesets[0][col]) elif flavor in ('codon', 'prot') and (args.outdata == 'prot'): seqs[label].append(allelesets[0][col]) elif flavor == 'codon' and args.outdata == 'dna': seqs[label].extend([ allelesets[x][col] == 'X' and 'N' or allelesets[x][col] for x in (1, 2, 3) ]) if seqs: with open( "{}.{}.fa".format(args.outprefix, mvf.metadata['contigs'][contig]['label']), 'wt') as outfile: for seqname in sorted(seqs): outfile.write(">{}\n{}\n".format(seqname, ''.join(seqs[seqname]))) seqs = None seqs = {} return ''
def main(arguments=sys.argv[1:]): """Main method for mvf_filter""" parser = argparse.ArgumentParser(description=""" Filters and Transforms MVF files""") parser.add_argument("--mvf", help="input MVF file") parser.add_argument("--out", help="output MVF file") parser.add_argument("--actions", nargs='*', help=("set of actions:args to perform," " note these are done in order as listed")) parser.add_argument("--test", help="manually input a line for testing") parser.add_argument("--testnchar", type=int, help="total number of samples for test string") parser.add_argument("--modulehelp", action="store_true", help="prints full module list and descriptions") parser.add_argument("--linebuffer", type=int, default=100000, help="number of lines to write at once to MVF") parser.add_argument("--verbose", action="store_true", help="report every line (for debugging)") parser.add_argument("--overwrite", action="store_true", help="USE WITH CAUTION: force overwrite of outputs") parser.add_argument("--quiet", action="store_true", help="suppress progress meter") parser.add_argument("-v", "--version", action="store_true", help="display version information") args = parser.parse_args(args=arguments) if args.version: print("Version 2015-02-26") sys.exit() args = parser.parse_args(args=arguments) time0 = time() if args.modulehelp: modulehelp() if not args.mvf and not args.test: raise RuntimeError("No input file specified with --mvf") if not args.out and not args.test: raise RuntimeError("No output file specified with --outs") if not args.actions: raise RuntimeError("No --actions specified!") ## Establish Input MVF if args.test: ncol = args.testnchar or len(args.test) else: mvf = MultiVariantFile(args.mvf, 'read') ncol = mvf.metadata['ncol'] ## Create Actionset actionset = build_actionset(args.actions, ncol) ##TESTING MODE if args.test: loc, alleles = args.test.split() linefail = False transformed = False #invar = invariant (single character) #refvar (all different than reference, two chars) #onecov (single coverage, + is second character) #onevar (one variable base, + is third character) #full = full alleles (all chars) if args.verbose: print(alleles) linetype = get_linetype(alleles) sys.stdout.write("MVF Encoding type '{}' detected\n".format(linetype)) for actionname, actiontype, actionfunc, actionarg in actionset: sys.stdout.write("Applying action {} ({}): ".format( actionname, actiontype)) if actiontype == 'filter': if not actionfunc(alleles, linetype): linefail = True sys.stdout.write("Filter Fail\n") break else: sys.stdout.write("Filter Pass\n") elif actiontype == 'transform': transformed = True alleles = actionfunc(alleles, linetype) linetype = get_linetype(alleles) if linetype == 'empty': linefail = True sys.stdout.write("Transform removed all alleles\n") break else: sys.stdout.write("Transform result {}\n".format(alleles)) elif actiontype == 'location': if not actionfunc([int(x) for x in loc.split(':')]): linefail = True sys.stdout.write("Location Fail\n") break else: sys.stdout.write("Location Pass\n") if not linefail: if transformed: if linetype == 'full': alleles = encode_mvfstring(alleles) if alleles: test_output = "{}\t{}\n".format(loc, alleles) sys.stdout.write("Final output = {}\n".format( test_output)) else: sys.stdout.write("Transform removed all alleles\n") else: sys.stdout.write("No changes applied\n") sys.stdout.write("Final output = {}\n".format(args.test)) sys.exit() ## MAIN MODE ## Set up file handler outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite) outmvf.metadata = deepcopy(mvf.metadata) ### reprocess header if actions are used that filter columns if any(x == y[0] for x in ('columns', 'collapsepriority') for y in actionset): labels = outmvf.metadata['labels'][:] for actionname, actiontype, actionfunc, actionarg in actionset: if actionname == 'columns': labels = [labels[x] for x in actionarg] elif actionname == 'collapsepriority': labels = [labels[x] for x in xrange(len(labels)) if x not in actionarg[1:]] oldindicies = mvf.get_sample_indices(labels) newsamples = {} for i, _ in enumerate(labels): newsamples[i] = mvf.metadata['samples'][oldindicies[i]] outmvf.metadata['samples'] = newsamples.copy() outmvf.metadata['labels'] = labels[:] outmvf.write_data(outmvf.get_header()) ## End header editing linebuffer = [] nbuffer = 0 for chrom, pos, allelesets in mvf.iterentries(decode=False): linefail = False transformed = False #invar = invariant (single character) #refvar (all different than reference, two chars) #onecov (single coverage, + is second character) #onevar (one variable base, + is third character) #full = full alleles (all chars) alleles = allelesets[0] linetype = get_linetype(alleles) if linetype == 'empty': continue if args.verbose: sys.stdout.write(" {} {}".format(alleles, linetype)) for actionname, actiontype, actionfunc, actionarg in actionset: if actiontype == 'filter': if not actionfunc(alleles, linetype): linefail = True elif actiontype == 'transform': transformed = True alleles = actionfunc(alleles, linetype) linetype = get_linetype(alleles) if linetype == 'empty': linefail = True elif actiontype == 'location': if not actionfunc([chrom, pos]): linefail = True if linefail: break if not linefail: if transformed: if linetype == 'full': alleles = mvf.encode(alleles) if not alleles: linefail = True if not linefail: nbuffer += 1 linebuffer.append((chrom, pos, (alleles,))) if args.verbose: sys.stdout.write("{}\n".format(alleles)) if nbuffer == args.linebuffer: outmvf.write_entries(linebuffer) linebuffer = [] nbuffer = 0 elif args.verbose: sys.stdout.write("FAIL\n") if linebuffer: outmvf.write_entries(linebuffer) linebuffer = [] if not args.quiet: print("Completed in {} seconds".format(time() - time0)) return ''
def main(arguments=sys.argv[1:]): """Main MVF Chromoplot method""" pallette = Pallette() parser = argparse.ArgumentParser(description=""" Makes chromoplots from MVF format""") parser.add_argument("--mvf", help="Input MVF file", required=True) parser.add_argument("--outprefix", help="output prefix (not required)") parser.add_argument("--samples", nargs='*', required=True, help="3 or more taxa to use for quartets") parser.add_argument("--outgroup", nargs='*', required=True, help="1 or more outgroups to use for quartets") parser.add_argument("--windowsize", type=int, default=100000) parser.add_argument("--contigs", nargs='*', help="""order of contigs/chromosomes defaults to order present in MVF """) parser.add_argument("--majority", action="store_true", help="call majority pattern in each window") parser.add_argument("--infotrack", action="store_true", help="""additional coverage information track on the bottom""") parser.add_argument("--emptymask", choices=pallette.colornames, default="none", help="mask empty regions with color (default=none)") parser.add_argument("--yscale", default=20, type=int, help="number of pixels tall for each track") parser.add_argument("--xscale", default=1, type=int, help="number of pixels wide for each window") parser.add_argument("--colors", nargs=3, choices=pallette.colornames, help="three colors to use for chromoplot") parser.add_argument("--quiet", action="store_true", help="suppress progress meter") parser.add_argument("-v", "--version", action="store_true", help="display version information") args = parser.parse_args(args=arguments) if args.version: print("Version 2015-02-01: Initial Public Release") sys.exit() if args.colors: pallette.basecolors = args.colors ## Establish MVF and parse chromosome information mvf = MultiVariantFile(args.mvf, 'read') contignames = args.contigs or [] master_contigs = [] for contigname in contignames: contig_found = False for contigid in mvf.metadata['contigs']: if (contigname == contigid or contigname == mvf.metadata['contigs'][contigid]['label']): master_contigs.append(( contigid, mvf.metadata['contigs'][contigid]['label'], mvf.metadata['contigs'][contigid]['length'])) contig_found = True if contig_found: continue raise RuntimeError(contigname, "not found in MVF contig ids or labels") quartets = [(x, y, z, outgroup) for x, y, z in combinations(args.samples, 3) for outgroup in args.outgroup] ## Begin iterations for quartet in quartets: params = {'contigs': master_contigs[:], 'outpath': args.outprefix or '_'.join(quartet) + ".png", 'labels': quartet, 'windowsize': args.windowsize, 'majority': args.majority, 'infotrack': args.infotrack, 'quiet': args.quiet, 'yscale': args.yscale, 'xscale': args.xscale} chromoplot = Chromoplot(params=params, pallette=pallette) quartet_indices = mvf.get_sample_indices(labels=quartet) for contig, pos, allelesets in mvf.iterentries( subset=quartet_indices, decode=True, quiet=args.quiet, contigs=[x[0] for x in master_contigs]): alleles = allelesets[0] if '-' in alleles: site_code = 'gap' elif any(x not in 'ATGCatgc' for x in alleles): site_code = 'ambiguous' elif alleles[3] not in alleles[:3]: site_code = 'nonpolar' elif len(set(alleles)) > 2: site_code = 'triallelic' else: site_code = sum([2**(3-j) * (alleles[j] != alleles[3]) for j in xrange(3)]) chromoplot.add_data(contig, int(pos // args.windowsize), site_code) chromoplot.plot_chromoplot() chromoplot.write_total_log() return ''
def main(arguments=sys.argv[1:]): """Main MVF Treemaker""" parser = argparse.ArgumentParser( description=""" Process MVF into alignment""" ) parser.add_argument("--mvf", help="inputmvf") parser.add_argument("--out", help="tree list output file") parser.add_argument("--samples", nargs="*", help="one or more taxon labels, default=all") parser.add_argument("--raxml_outgroups", nargs="*", help="select outgroups to use in RAxML") parser.add_argument( "--rootwith", nargs="*", help="""root output trees with these taxa after RAxML""", ) parser.add_argument("--contigs", nargs="*", help="choose one or more contigs, default=all") parser.add_argument("--outputcontiglabels", action="store_true", help="output contig labels instead of ids") parser.add_argument("--outputempty", action="store_true", help="output entries of windows with no data") parser.add_argument( "--hapmode", default="none", choices=["none", "randomone", "randomboth", "major", "minor", "majorminor"], help="""haplotype splitting mode. 'none' = no splitting; 'randomone' = pick one allele randomly (recommended); 'randomboth = pick alleles randomly, keep both; 'major' = pick the more common allele; 'minor' = pick the less common allele; 'majorminor' = put the major in 'a' and minor in 'b' """, ) parser.add_argument( "--windowsize", type=int, default=10000, help="""specify genomic region size, or use -1 for whole contig""", ) parser.add_argument("--minsites", type=int, default=100, help="""minimum number of sites [100]""") parser.add_argument( "--minsitedepth", type=int, default=1, help="""mininum depth of sites to use in alignment [1]""", ) parser.add_argument( "--minseqcoverage", type=float, default=0.1, help="""proportion of total alignment a sequence must cover to be retianed [0.1]""", ) parser.add_argument("--mindepth", type=int, default=4, help="""minimum number of sequences [4]""") parser.add_argument( "--bootstrap", type=int, help="""turn on rapid bootstrapping for RAxML and perform specified number of replicates""", ) parser.add_argument("--raxml_model", default="GTRGAMMA", help="""choose custom RAxML model [GTRGAMMA]""") parser.add_argument("--raxmlpath", help="manually specify RAxML path") parser.add_argument("--raxmlopts", default="", help="specify additional RAxML arguments") parser.add_argument( "--duplicateseq", default="dontuse", choices=["dontuse", "keep", "remove"], help="""[dontuse] remove for tree making, replace as zero-branch-length sister taxa; keep=keep in for tree making, may cause errors for RAxML; remove=remove entirely from alignment""", ) parser.add_argument("--tempdir", default="raxmltemp", help="""temporary dir. location default=./tempdir""") parser.add_argument("--tempprefix", default="mvftree", help="""temporary file prefix, default=mvftree""") parser.add_argument("--quiet", action="store_true", help="suppress screen output") parser.add_argument("-v", "--version", action="store_true", help="display version information") args = parser.parse_args(args=arguments) if args.version: print("Version 2015-02-26") sys.exit() ## ESTABLISH FILE OBJECTS args.contigs = args.contigs or [] mvf = MultiVariantFile(args.mvf, "read") treefile = OutputFile( args.out, headers=[ "contig", "windowstart", "windowsize", "tree", "topology", "topoid", # 'templabels', ### USED FOR DEBUGGING ### "alignlength", "aligndepth", "status", ], ) topofile = OutputFile(args.out + ".counts", headers=["rank", "topology", "count"]) sample_cols = args.samples and mvf.get_sample_indices(args.samples) or [] if args.tempdir: tmpdir = os.path.abspath(args.tempdir) else: tmpdir = os.path.abspath("./raxmltemp") if not os.path.exists(tmpdir): os.mkdir(tmpdir) os.chdir(tmpdir) ## SETUP PARAMS main_labels = mvf.get_sample_labels(sample_cols) if args.hapmode in ["randomboth", "majorminor"]: main_labels = [label + x for x in ["a", "b"] for label in main_labels] params = { "outgroups": args.raxml_outgroups or [], "rootwith": args.rootwith or [], "minsites": args.minsites, "minseqcoverage": args.minseqcoverage, "mindepth": args.mindepth, "raxmlpath": args.raxmlpath, "raxmlopts": args.raxmlopts, "duplicateseq": args.duplicateseq, "model": args.raxml_model, "bootstrap": args.bootstrap, "windowsize": args.windowsize, "hapmode": args.hapmode, "tempdir": tmpdir, "tempprefix": args.tempprefix, } ## WINDOW START INTERATION current_contig = "" window_start = 0 window = None topo_ids = {} topo_counts = {} for contig, pos, allelesets in mvf.iterentries( contigs=args.contigs, subset=sample_cols, quiet=args.quiet, no_invariant=False, no_ambig=False, no_gap=False, decode=True, ): if contig != current_contig or (args.windowsize != -1 and (pos > window_start + args.windowsize)): if window: entry = window.maketree_raxml(params) if entry["status"] != "ok": if args.outputempty: treefile.write_entry(entry) else: topo = entry["topology"] topo_counts[topo] = topo_counts.get(topo, 0) + 1 if topo not in topo_ids: topo_ids[topo] = topo_ids and max(topo_ids.values()) + 1 or 0 entry["topoid"] = topo_ids[topo] treefile.write_entry(entry) window_start = ( (contig == current_contig and args.windowsize != -1) and window_start + args.windowsize or 0 ) current_contig = contig[:] window = None window = WindowData( window_params={ "contigname": ( args.outputcontiglabels and mvf.get_contig_label(current_contig) or current_contig[:] ), "windowstart": (args.windowsize == -1 and "-1" or window_start + 0), "windowsize": args.windowsize, "labels": main_labels[:], } ) ## ADD ALLELES if args.hapmode != "none": allelesets[0] = hapsplit(allelesets[0], args.hapmode) window.append_alleles(allelesets[0], minsitedepth=args.minsitedepth) ## LAST LOOP entry = window.maketree_raxml(params) if entry["status"] != "ok": if args.outputempty: treefile.write_entry(entry) else: topo = entry["topology"] topo_counts[topo] = topo_counts.get(topo, 0) + 1 if topo not in topo_ids: topo_ids[topo] = topo_ids and max(topo_ids.values()) + 1 or 0 entry["topoid"] = topo_ids[topo] treefile.write_entry(entry) window = None ## END WINDOW ITERATION topo_list = sorted([(v, k) for k, v in topo_counts.iteritems()], reverse=True) for rank, [value, topo] in enumerate(topo_list): topofile.write_entry({"rank": rank, "count": value, "topology": topo}) return ""
def main(arguments=sys.argv[1:]): """Main method for mvf2fasta""" parser = argparse.ArgumentParser(description=""" Process MVF into FASTA alignment""") parser.add_argument("--mvf", help="input MVF file", required=True) parser.add_argument("--out", help="target FASTA file", required=True) parser.add_argument("--labeltype", choices=['long', 'short'], default='long', help="long labels with all metadata or short ids") parser.add_argument("--regions", nargs='*', help="one or more regions id,start,stop (inclusive)") parser.add_argument("--samples", nargs='*', help="one or more taxon labels, leave blank for all") parser.add_argument("--outgroups", nargs="*") parser.add_argument("--contigs", nargs='*', help="one or more taxon labels, leave blank for all") parser.add_argument("--buffer", type=int, default=10, help="size (Mbp) of write buffer for each sample") parser.add_argument("--tmpdir", default=".", help="directory to write temporary fasta files") parser.add_argument("--quiet", action="store_true", default=True, help="suppress screen output") parser.add_argument("-v", "--version", action="store_true", help="display version information") args = parser.parse_args(args=arguments) if args.version: print("Version 2015-02-01: Initial Public Release") sys.exit() mvf = MultiVariantFile(args.mvf, 'read') if args.contigs: contigs = dict(mvf.metadata['contigs'][c] for c in args.contigs) else: contigs = dict(mvf.metadata['contigs']) sample_cols = mvf.get_sample_indices(args.samples or None) labels = mvf.get_sample_labels(sample_cols) current_contig = None tmp_files = dict((fn, open(fn+'.tmp', 'w+', args.buffer)) for fn in labels) for contig, _, allelesets in mvf.iterentries( contigs=args.contigs, subset=sample_cols, quiet=args.quiet, decode=True): alleles = mvf.decode(allelesets) if current_contig != contig: current_contig = contig for col, label in zip(sample_cols, labels): if args.labeltype == 'long': tmp_files[label].write( '\n>{} contig={} length={}\n{}'.format( label, contigs[current_contig]['label'], contigs[current_contig]['length'], alleles[col])) elif args.labeltype == 'short': tmp_files[label].write( '\n>{}_{}\n{}'.format( label, contigs[current_contig]['label'], alleles[col])) else: for col, label in zip(sample_cols, labels): tmp_files[label].write(alleles[col]) with open(args.out, 'w') as outfile: for filehandler in tmp_files.values(): filehandler.seek(0, 0) buff = filehandler.read(args.buffer) while len(buff): outfile.write(buff) buff = filehandler.read(args.buffer) filehandler.close() os.remove(os.path.join(args.tmpdir, filehandler.name)) return ''