Example No. 1
def main(arguments=None):
    """Main method"""
    arguments = sys.argv[1:] if arguments is None else arguments
    parser = generate_argparser()
    args = parser.parse_args(args=arguments)
    mvf = MultiVariantFile(args.mvf, 'read')
    flavor = mvf.metadata['flavor']
    if (flavor in ("dna", "rna") and args.outdata == "prot") or (
            flavor == "prot" and args.outdata in ("dna", "rna")):
        raise RuntimeError(
            "--outdata {} incompatiable with '{}' flavor mvf".format(
                args.outdata, flavor))
    sample_cols = mvf.get_sample_indices(args.samples or None)
    labels = mvf.get_sample_labels(sample_cols)
    current_contig = ''
    seqs = {}
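    # Accumulate decoded alleles per sample label; when the contig changes,
    # the finished sequences are written to "<outprefix>.<contig label>.fa"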
    for contig, _, allelesets in mvf.iterentries(quiet=args.quiet,
                                                 decode=True):
        if contig != current_contig:
            if seqs:
                with open(
                        "{}.{}.fa".format(
                            args.outprefix,
                            mvf.metadata['contigs'][current_contig]['label']),
                        'wt') as outfile:
                    for seqname in sorted(seqs):
                        outfile.write(">{}\n{}\n".format(
                            seqname, ''.join(seqs[seqname])))
            seqs = None
            seqs = {}
            current_contig = contig[:]
        for col, label in zip(sample_cols, labels):
            if label not in seqs:
                seqs[label] = []
            if flavor in ('dna', 'rna'):
                seqs[label].append(allelesets[0][col] == 'X' and 'N'
                                   or allelesets[0][col])
            elif flavor in ('codon', 'prot') and (args.outdata == 'prot'):
                seqs[label].append(allelesets[0][col])
            elif flavor == 'codon' and args.outdata == 'dna':
                seqs[label].extend([
                    allelesets[x][col] == 'X' and 'N' or allelesets[x][col]
                    for x in (1, 2, 3)
                ])
    if seqs:
        with open(
                "{}.{}.fa".format(args.outprefix,
                                  mvf.metadata['contigs'][contig]['label']),
                'wt') as outfile:
            for seqname in sorted(seqs):
                outfile.write(">{}\n{}\n".format(seqname,
                                                 ''.join(seqs[seqname])))
            seqs = None
            seqs = {}
    return ''
Example No. 2
def main(arguments=None):
    """Main method"""
    arguments = sys.argv[1:] if arguments is None else arguments
    parser = generate_argparser()
    args = parser.parse_args(args=arguments)
    # HELP MENU
    if args.morehelp:
        modulehelp(MODULENAMES)
        sys.exit()
    # ESTABLISH MVF
    mvf = MultiVariantFile(args.mvf, 'read')
    # Argument Pre-processing
    if args.allele_groups:
        groups = {}
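        # Each allele-group entry has the form NAME:label1,label2,...;
        # groups must not share any sample (checked below)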
        for elem in args.allele_groups:
            elem = elem.split(':')
            groups[elem[0]] = mvf.get_sample_indices(labels=elem[1].split(','))
        args.allele_groups = groups.copy()
        for grp0, grp1 in combinations(groups, 2):
            if set(groups[grp0]) & set(groups[grp1]):
                raise RuntimeError("Groups contain same element",
                                   set(groups[grp0]) & set(groups[grp1]))
    if args.speciesgroups:
        groups = {}
        for elem in args.speciesgroups:
            elem = elem.split(':')
            groups[elem[0]] = mvf.get_sample_indices(labels=elem[1].split(','))
        args.speciesgroups = groups.copy()
        for specgroup in groups:
            ngroup = 0
            for allelegroup in args.allele_groups.values():
                if set(allelegroup) & set(groups[specgroup]):
                    ngroup += 1
                    if ngroup > 1:
                        raise RuntimeError(specgroup, "split across 2+ groups")
    # MODULES
    if args.module == 'Coverage':
        module = Coverage(params=vars(args))
    elif args.module == 'GroupUniqueAlleleWindow':
        module = GroupUniqueAlleleWindow(params=vars(args))
    elif args.module == 'PiDiversityWindow':
        module = PiDiversityWindow(params=vars(args))
    elif args.module == 'PairwiseNS':
        module = PairwiseNS(params=vars(args))
    # RUN MODULE
    module.analyze(mvf)

    return ''
Example No. 3
def main(arguments=sys.argv[1:]):
    """Main method for mvf_filter"""
    parser = argparse.ArgumentParser(description="""
    Filters and Transforms MVF files""")
    parser.add_argument("--mvf", help="input MVF file")
    parser.add_argument("--out", help="output MVF file")
    parser.add_argument("--actions", nargs='*',
                        help=("set of actions:args to perform,"
                              " note these are done in order as listed"))
    parser.add_argument("--test", help="manually input a line for testing")
    parser.add_argument("--testnchar", type=int,
                        help="total number of samples for test string")
    parser.add_argument("--modulehelp", action="store_true",
                        help="prints full module list and descriptions")
    parser.add_argument("--linebuffer", type=int, default=100000,
                        help="number of lines to write at once to MVF")
    parser.add_argument("--verbose", action="store_true",
                        help="report every line (for debugging)")
    parser.add_argument("--overwrite", action="store_true",
                        help="USE WITH CAUTION: force overwrite of outputs")
    parser.add_argument("--quiet", action="store_true",
                        help="suppress progress meter")
    parser.add_argument("-v", "--version", action="store_true",
                        help="display version information")
    args = parser.parse_args(args=arguments)
    if args.version:
        print("Version 2015-02-26")
        sys.exit()
    time0 = time()
    if args.modulehelp:
        modulehelp()
    if not args.mvf and not args.test:
        raise RuntimeError("No input file specified with --mvf")
    if not args.out and not args.test:
        raise RuntimeError("No output file specified with --outs")
    if not args.actions:
        raise RuntimeError("No --actions specified!")
    ## Establish Input MVF
    if args.test:
        ncol = args.testnchar or len(args.test)
    else:
        mvf = MultiVariantFile(args.mvf, 'read')
        ncol = mvf.metadata['ncol']
    ## Create Actionset
    actionset = build_actionset(args.actions, ncol)
    ##TESTING MODE
    if args.test:
        loc, alleles = args.test.split()
        linefail = False
        transformed = False
        #invar = invariant (single character)
        #refvar (all different than reference, two chars)
        #onecov (single coverage, + is second character)
        #onevar (one variable base, + is third character)
        #full = full alleles (all chars)
        if args.verbose:
            print(alleles)
        linetype = get_linetype(alleles)
        sys.stdout.write("MVF Encoding type '{}' detected\n".format(linetype))
        for actionname, actiontype, actionfunc, actionarg in actionset:
            sys.stdout.write("Applying action {} ({}): ".format(
                actionname, actiontype))
            if actiontype == 'filter':
                if not actionfunc(alleles, linetype):
                    linefail = True
                    sys.stdout.write("Filter Fail\n")
                    break
                else:
                    sys.stdout.write("Filter Pass\n")
            elif actiontype == 'transform':
                transformed = True
                alleles = actionfunc(alleles, linetype)
                linetype = get_linetype(alleles)
                if linetype == 'empty':
                    linefail = True
                    sys.stdout.write("Transform removed all alleles\n")
                    break
                else:
                    sys.stdout.write("Transform result {}\n".format(alleles))
            elif actiontype == 'location':
                if not actionfunc([int(x) for x in loc.split(':')]):
                    linefail = True
                    sys.stdout.write("Location Fail\n")
                    break
                else:
                    sys.stdout.write("Location Pass\n")
        if not linefail:
            if transformed:
                if linetype == 'full':
                    alleles = encode_mvfstring(alleles)
                if alleles:
                    test_output = "{}\t{}\n".format(loc, alleles)
                    sys.stdout.write("Final output = {}\n".format(
                        test_output))
                else:
                    sys.stdout.write("Transform removed all alleles\n")
            else:
                sys.stdout.write("No changes applied\n")
                sys.stdout.write("Final output = {}\n".format(args.test))
        sys.exit()
    ## MAIN MODE
    ## Set up file handler
    outmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    outmvf.metadata = deepcopy(mvf.metadata)
    ### reprocess header if actions are used that filter columns
    if any(x == y[0] for x in ('columns', 'collapsepriority')
           for y in actionset):
        labels = outmvf.metadata['labels'][:]
        for actionname, actiontype, actionfunc, actionarg in actionset:
            if actionname == 'columns':
                labels = [labels[x] for x in actionarg]
            elif actionname == 'collapsepriority':
                labels = [labels[x] for x in xrange(len(labels))
                          if x not in actionarg[1:]]
        oldindicies = mvf.get_sample_indices(labels)
        newsamples = {}
        for i, _ in enumerate(labels):
            newsamples[i] = mvf.metadata['samples'][oldindicies[i]]
        outmvf.metadata['samples'] = newsamples.copy()
        outmvf.metadata['labels'] = labels[:]
    outmvf.write_data(outmvf.get_header())
    ## End header editing
    linebuffer = []
    nbuffer = 0
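    # Stream entries, applying each action in the order given; entries that
    # pass every filter are written out in blocks of --linebuffer lines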
    for chrom, pos, allelesets in mvf.iterentries(decode=False):
        linefail = False
        transformed = False
        #invar = invariant (single character)
        #refvar (all different than reference, two chars)
        #onecov (single coverage, + is second character)
        #onevar (one variable base, + is third character)
        #full = full alleles (all chars)
        alleles = allelesets[0]
        linetype = get_linetype(alleles)
        if linetype == 'empty':
            continue
        if args.verbose:
            sys.stdout.write(" {} {}".format(alleles, linetype))
        for actionname, actiontype, actionfunc, actionarg in actionset:
            if actiontype == 'filter':
                if not actionfunc(alleles, linetype):
                    linefail = True
            elif actiontype == 'transform':
                transformed = True
                alleles = actionfunc(alleles, linetype)
                linetype = get_linetype(alleles)
                if linetype == 'empty':
                    linefail = True
            elif actiontype == 'location':
                if not actionfunc([chrom, pos]):
                    linefail = True
            if linefail:
                break
        if not linefail:
            if transformed:
                if linetype == 'full':
                    alleles = mvf.encode(alleles)
                if not alleles:
                    linefail = True
        if not linefail:
            nbuffer += 1
            linebuffer.append((chrom, pos, (alleles,)))
            if args.verbose:
                sys.stdout.write("{}\n".format(alleles))
            if nbuffer == args.linebuffer:
                outmvf.write_entries(linebuffer)
                linebuffer = []
                nbuffer = 0
        elif args.verbose:
            sys.stdout.write("FAIL\n")
    if linebuffer:
        outmvf.write_entries(linebuffer)
        linebuffer = []
    if not args.quiet:
        print("Completed in {} seconds".format(time() - time0))
    return ''
Example No. 4
def main(arguments=sys.argv[1:]):
    """Main method for fasta2mvf"""
    parser = argparse.ArgumentParser(description="""
    Converts multisample-FASTA to MVF file with filtering """)
    parser.add_argument("--fasta", help="input FASTA file", required=True)
    parser.add_argument("--out", help="output MVF file", required=True)
    parser.add_argument("--contigids", nargs='*',
                        help=("""manually specify one or more contig ids
                                 as ID:NAME"""))
    parser.add_argument("--samplereplace", nargs="*",
                        help="""one or more TAG:NEWLABEL or TAG, items,
                                if TAG found in sample label, replace with
                                NEW (or TAG if NEW not specified)
                                NEW and TAG must each be unique""")
    parser.add_argument("--reflabel", default="REF",
                        help="label for reference sample (default='REF')")
    parser.add_argument("--allelesfrom", default=None,
                        help="""get additional alignment columns
                                from INFO fields (:-separated)""")
    parser.add_argument("--readbuffer", type=int, default=100000,
                        help="number of lines to hold in READ buffer")
    parser.add_argument("--writebuffer", type=int, default=100000,
                        help="number of lines to hold in WRITE buffer")
    parser.add_argument("--fieldsep", default="NONE",
                        choices=['TAB', 'SPACE', 'DBLSPACE',
                                 'COMMA', 'MIXED', 'PIPE'],
                        help="""FASTA field separator; assumes
                                '>database/SEP/accession/SEP/locus'
                                format (default='NONE')""")
    parser.add_argument("--contigfield", type=int,
                        help="""when headers are split by --fieldsep,
                        the 0-based index of the contig id""")
    parser.add_argument("--samplefield", type=int,
                        help="""when headers are split by --fieldsep,
                        the 0-based index of the sample id""")
    parser.add_argument("--overwrite", action="store_true",
                        help="USE WITH CAUTION: force overwrite of outputs")
    parser.add_argument("-v", "--version", action="store_true",
                        help="display version information")
    args = parser.parse_args(args=arguments)
    if args.version:
        print("Version 2015-07-07")
        sys.exit()
    sepchars = dict([("PIPE", "|"), ("TAB", "\t"),
                     ("SPACE", " "), ("DBLSPACE", "  "),
                     ("COMMA", ","), ("NONE", None)])
    args.fieldsep = sepchars[args.fieldsep]
    mvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    fasta = {}
    current_contig = 0
    fsamples = []
    fcontigs = []
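    # Read every sequence, grouping by contig and sample parsed from the
    # FASTA header (falls back to an UNK contig if header fields are missing)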
    for header, seq in fasta_iter(args.fasta):
        header = header.split(args.fieldsep)
        if (len(header) < max(3, args.contigfield or 0, args.samplefield or 0)
                or args.contigfield is None or args.samplefield is None):
            contig = "UNK{}".format(current_contig)
            sample = header[0]
        else:
            contig = header[args.contigfield]
            sample = header[args.samplefield]
        if contig not in fcontigs:
            fcontigs.append(contig)
            fasta[contig] = {}
        if sample not in fsamples:
            fsamples.append(sample)
        fasta[contig][sample] = (len(seq), seq)
    reflabel = None
    if args.reflabel:
        for i, samplename in enumerate(fsamples):
            if args.reflabel in samplename:
                reflabel = i
                break
    if reflabel is not None:
        newref = fsamples.pop(reflabel)
        fsamples = [newref] + fsamples
    for i, contig in enumerate(fcontigs):
        mvf.metadata['contigs'][i] = {
            'label': contig,
            'length': max([fasta[contig][x][0] for x in fasta[contig]])}
    mvf.metadata['labels'] = fsamples[:]
    for i, label in enumerate(fsamples[:]):
        mvf.metadata['samples'][i] = {'label': label}
    mvf.metadata['ncol'] = len(mvf.metadata['labels'])
    mvf.metadata['sourceformat'] = 'fasta'
    # WRITE MVF HEADER
    mvf.write_data(mvf.get_header())
    mvfentries = []
    nentry = 0
    mvf_alleles = {}
    for cind, contig in enumerate(fcontigs):
        for pos in range(mvf.metadata['contigs'][cind]['length']):
            mvf_alleles = encode_mvfstring(
                ''.join(pos >= fasta[contig][samp][0] and '-' or
                        fasta[contig][samp][1][pos]
                        for samp in fsamples))
            if mvf_alleles:
                mvfentries.append(
                    (cind, pos+1, (mvf_alleles,)))
                nentry += 1
                if nentry == args.writebuffer:
                    mvf.write_entries(mvfentries, encoded=True)
                    mvfentries = []
                    nentry = 0
    if mvfentries:
        mvf.write_entries(mvfentries, encoded=True)
        mvfentries = []
    return ''
Example No. 5
def main(arguments=sys.argv[1:]):
    """Main MVF Chromoplot method"""
    pallette = Pallette()
    parser = argparse.ArgumentParser(description="""
    Makes chromoplots from MVF format""")
    parser.add_argument("--mvf", help="Input MVF file", required=True)
    parser.add_argument("--outprefix", help="output prefix (not required)")
    parser.add_argument("--samples", nargs='*', required=True,
                        help="3 or more taxa to use for quartets")
    parser.add_argument("--outgroup", nargs='*', required=True,
                        help="1 or more outgroups to use for quartets")
    parser.add_argument("--windowsize", type=int, default=100000)
    parser.add_argument("--contigs", nargs='*',
                        help="""order of contigs/chromosomes
                                defaults to order present in MVF
                                """)
    parser.add_argument("--majority", action="store_true",
                        help="call majority pattern in each window")
    parser.add_argument("--infotrack", action="store_true",
                        help="""additional coverage information track
                                on the bottom""")
    parser.add_argument("--emptymask", choices=pallette.colornames,
                        default="none",
                        help="mask empty regions with color (default=none)")
    parser.add_argument("--yscale", default=20, type=int,
                        help="number of pixels tall for each track")
    parser.add_argument("--xscale", default=1, type=int,
                        help="number of pixels wide for each window")
    parser.add_argument("--colors", nargs=3, choices=pallette.colornames,
                        help="three colors to use for chromoplot")
    parser.add_argument("--quiet", action="store_true",
                        help="suppress progress meter")
    parser.add_argument("-v", "--version", action="store_true",
                        help="display version information")
    args = parser.parse_args(args=arguments)
    if args.version:
        print("Version 2015-02-01: Initial Public Release")
        sys.exit()
    if args.colors:
        pallette.basecolors = args.colors
    ## Establish MVF and parse chromosome information
    mvf = MultiVariantFile(args.mvf, 'read')
    contignames = args.contigs or []
    master_contigs = []
    for contigname in contignames:
        contig_found = False
        for contigid in mvf.metadata['contigs']:
            if (contigname == contigid or
                    contigname == mvf.metadata['contigs'][contigid]['label']):
                master_contigs.append((
                    contigid, mvf.metadata['contigs'][contigid]['label'],
                    mvf.metadata['contigs'][contigid]['length']))
                contig_found = True
        if contig_found:
            continue
        raise RuntimeError(contigname, "not found in MVF contig ids or labels")
    quartets = [(x, y, z, outgroup) for x, y, z in
                combinations(args.samples, 3) for outgroup in args.outgroup]
    ## Begin iterations
    for quartet in quartets:
        params = {'contigs': master_contigs[:],
                  'outpath': args.outprefix or '_'.join(quartet) + ".png",
                  'labels': quartet,
                  'windowsize': args.windowsize,
                  'majority': args.majority,
                  'infotrack': args.infotrack,
                  'quiet': args.quiet,
                  'yscale': args.yscale,
                  'xscale': args.xscale}
        chromoplot = Chromoplot(params=params, pallette=pallette)
        quartet_indices = mvf.get_sample_indices(labels=quartet)
        for contig, pos, allelesets in mvf.iterentries(
                subset=quartet_indices, decode=True,
                quiet=args.quiet, contigs=[x[0] for x in master_contigs]):
            alleles = allelesets[0]
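            # Classify the site: gap, ambiguous, nonpolar (outgroup allele not
            # in the ingroup), triallelic, or a bitwise code (8/4/2) marking
            # which ingroup samples differ from the outgroup allele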
            if '-' in alleles:
                site_code = 'gap'
            elif any(x not in 'ATGCatgc' for x in alleles):
                site_code = 'ambiguous'
            elif alleles[3] not in alleles[:3]:
                site_code = 'nonpolar'
            elif len(set(alleles)) > 2:
                site_code = 'triallelic'
            else:
                site_code = sum([2**(3-j) * (alleles[j] != alleles[3])
                                 for j in xrange(3)])
            chromoplot.add_data(contig, int(pos // args.windowsize), site_code)
        chromoplot.plot_chromoplot()
        chromoplot.write_total_log()
    return ''
Example No. 6
def main(arguments=sys.argv[1:]):
    """Main method for vcf2mvf"""
    parser = argparse.ArgumentParser(
        description="""
    Converts multisample-VCF to MVF file with filtering """
    )
    parser.add_argument("--vcf", help="input VCF file", required=True)
    parser.add_argument("--out", help="output MVF file", required=True)
    parser.add_argument("--maskdepth", type=int, default=1, help="below this depth mask with N/n")
    parser.add_argument(
        "--lowdepth",
        type=int,
        default=3,
        help="""below this depth convert to lower case
                              set to 0 to disable""",
    )
    parser.add_argument(
        "--maskqual",
        type=int,
        default=3,
        help="""low quality cutoff, bases replaced by N/-
                             set to 0 to disable""",
    )
    parser.add_argument(
        "--lowqual",
        type=int,
        default=20,
        help="""below this quality convert to lower case
                                set to 0 to disable""",
    )
    parser.add_argument(
        "--contigids",
        nargs="*",
        help=(
            """manually specify one or more contig ids
                                 as ID:NAME"""
        ),
    )
    parser.add_argument(
        "--samplereplace",
        nargs="*",
        help="""one or more TAG:NEWLABEL or TAG, items,
                                if TAG found in sample label, replace with
                                NEW (or TAG if NEW not specified)
                                NEW and TAG must each be unique""",
    )
    parser.add_argument("--reflabel", default="REF", help="label for reference sample (default='REF')")
    parser.add_argument(
        "--allelesfrom",
        default=None,
        help="""get additional alignment columns
                                from INFO fields (:-separated)""",
    )
    parser.add_argument("--linebuffer", type=int, default=100000, help="number of lines to hold in read/write buffer")
    parser.add_argument("--no_autoindex", action="store_true", help="do not automatically index contigs from the VCF")
    parser.add_argument(
        "--fieldsep",
        default="TAB",
        choices=["TAB", "SPACE", "DBLSPACE", "COMMA", "MIXED"],
        help="""VCF field separator (default='TAB')""",
    )
    parser.add_argument("--overwrite", action="store_true", help="USE WITH CAUTION: force overwrite of outputs")
    parser.add_argument("--quiet", action="store_true", help="suppress progress meter")
    parser.add_argument("-v", "--version", action="store_true", help="display version information")
    args = parser.parse_args(args=arguments)
    if args.version:
        print("Version 2015-02-26")
        sys.exit()
    sepchars = dict([("TAB", "\t"), ("SPACE", " "), ("DBLSPACE", "  "), ("COMMA", ","), ("MIXED", None)])
    args.fieldsep = sepchars[args.fieldsep]
    ## ESTABLISH VCF
    vcf = VariantCallFile(args.vcf, indexcontigs=(not args.no_autoindex))
    ## ESTABLISH MVF
    mvf = MultiVariantFile(args.out, "write", overwrite=args.overwrite)
    # PROCESS CONTIG INFO
    contigs = vcf.metadata["contigs"].copy()
    maxcontigid = 0
    newids = set([])
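    # Remap contig ids: any --contigids entries claim their requested ids
    # first, then remaining VCF contigs get sequential ids above the maximum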
    if args.contigids:
        for cid, cname in (x.split(":") for x in args.contigids):
            for tempid in contigs:
                if cname in contigs[tempid]["label"]:
                    try:
                        cid = int(cid)
                    except ValueError:
                        pass
                    mvf.metadata["contigs"][cid] = contigs[tempid].copy()
                    del contigs[tempid]
                    newids.update([cid])
                    break
        for cid in newids:
            try:
                maxcontigid = max([maxcontigid, int(cid) + 1])
            except ValueError:
                continue
    tempids = set(contigs.keys()) - newids
    for tempid, newid in sorted(zip(tempids, xrange(maxcontigid, maxcontigid + len(tempids)))):
        mvf.metadata["contigs"][newid] = vcf.metadata["contigs"][tempid]
    contig_translate = dict([(mvf.metadata["contigs"][x]["label"], x) for x in mvf.metadata["contigs"]])
    # PROCESS SAMPLE INFO
    samplelabels = [args.reflabel] + vcf.metadata["samples"][:]
    if args.allelesfrom:
        args.allelesfrom = args.allelesfrom.split(":")
        samplelabels += args.allelesfrom
    if args.samplereplace:
        newsample = [":" in tuple(x) and x.split(":") or tuple([x, x]) for x in args.samplereplace]
        unmatched = [x for x in enumerate(samplelabels)]
        for old, new in newsample:
            labelmatched = False
            for j, (i, name) in enumerate(unmatched):
                if old in name:
                    samplelabels[i] = new
                    labelmatched = j
                    break
            if labelmatched is not False:
                del unmatched[labelmatched]
    mvf.metadata["labels"] = samplelabels[:]
    for i, label in enumerate(samplelabels):
        mvf.metadata["samples"][i] = {"label": label}
    mvf.metadata["ncol"] = len(mvf.metadata["labels"])
    mvf.metadata["sourceformat"] = vcf.metadata["sourceformat"]
    ## WRITE MVF HEADER
    mvf.write_data(mvf.get_header())
    mvfentries = []
    nentry = 0
    for vcfrecord in vcf.iterentries(vars(args)):
        mvf_alleles = encode_mvfstring("".join(vcfrecord["genotypes"]))
        if mvf_alleles:
            mvfentries.append(
                (contig_translate.get(vcfrecord["contig"], vcfrecord["contig"]), vcfrecord["coord"], (mvf_alleles,))
            )
            nentry += 1
            if nentry == args.linebuffer:
                mvf.write_entries(mvfentries, encoded=True)
                mvfentries = []
                nentry = 0
    if mvfentries:
        mvf.write_entries(mvfentries, encoded=True)
        mvfentries = []
    return ""
Example No. 7
def main(arguments=sys.argv[1:]):
    """Main MVF Treemaker"""
    parser = argparse.ArgumentParser(
        description="""
    Process MVF into alignment"""
    )
    parser.add_argument("--mvf", help="inputmvf")
    parser.add_argument("--out", help="tree list output file")
    parser.add_argument("--samples", nargs="*", help="one or more taxon labels, default=all")
    parser.add_argument("--raxml_outgroups", nargs="*", help="select outgroups to use in RAxML")
    parser.add_argument(
        "--rootwith",
        nargs="*",
        help="""root output trees with
                                these taxa after RAxML""",
    )
    parser.add_argument("--contigs", nargs="*", help="choose one or more contigs, default=all")
    parser.add_argument("--outputcontiglabels", action="store_true", help="output contig labels instead of ids")
    parser.add_argument("--outputempty", action="store_true", help="output entries of windows with no data")
    parser.add_argument(
        "--hapmode",
        default="none",
        choices=["none", "randomone", "randomboth", "major", "minor", "majorminor"],
        help="""haplotype splitting mode.
                                'none' = no splitting;
                                'randomone' = pick one allele randomly
                                              (recommended);
                                'randomboth = pick alleles randomly,
                                              keep both;
                                'major' = pick the more common allele;
                                'minor' = pick the less common allele;
                                'majorminor' = put the major in 'a' and
                                               minor in 'b'
                            """,
    )
    parser.add_argument(
        "--windowsize",
        type=int,
        default=10000,
        help="""specify genomic region size,
                                or use -1 for whole contig""",
    )
    parser.add_argument("--minsites", type=int, default=100, help="""minimum number of sites [100]""")
    parser.add_argument(
        "--minsitedepth",
        type=int,
        default=1,
        help="""mininum depth of sites to use in alignment
                                [1]""",
    )
    parser.add_argument(
        "--minseqcoverage",
        type=float,
        default=0.1,
        help="""proportion of total alignment a sequence
                                must cover to be retained [0.1]""",
    )
    parser.add_argument("--mindepth", type=int, default=4, help="""minimum number of sequences [4]""")
    parser.add_argument(
        "--bootstrap",
        type=int,
        help="""turn on rapid bootstrapping for RAxML and
                             perform specified number of replicates""",
    )
    parser.add_argument("--raxml_model", default="GTRGAMMA", help="""choose custom RAxML model [GTRGAMMA]""")
    parser.add_argument("--raxmlpath", help="manually specify RAxML path")
    parser.add_argument("--raxmlopts", default="", help="specify additional RAxML arguments")
    parser.add_argument(
        "--duplicateseq",
        default="dontuse",
        choices=["dontuse", "keep", "remove"],
        help="""[dontuse] remove for tree making,
                                replace as zero-branch-length sister taxa;
                                keep=keep in for tree making,
                                may cause errors for RAxML;
                                remove=remove entirely from alignment""",
    )
    parser.add_argument("--tempdir", default="raxmltemp", help="""temporary dir. location default=./tempdir""")
    parser.add_argument("--tempprefix", default="mvftree", help="""temporary file prefix, default=mvftree""")
    parser.add_argument("--quiet", action="store_true", help="suppress screen output")
    parser.add_argument("-v", "--version", action="store_true", help="display version information")

    args = parser.parse_args(args=arguments)
    if args.version:
        print("Version 2015-02-26")
        sys.exit()
    ## ESTABLISH FILE OBJECTS
    args.contigs = args.contigs or []
    mvf = MultiVariantFile(args.mvf, "read")
    treefile = OutputFile(
        args.out,
        headers=[
            "contig",
            "windowstart",
            "windowsize",
            "tree",
            "topology",
            "topoid",
            # 'templabels', ### USED FOR DEBUGGING ###
            "alignlength",
            "aligndepth",
            "status",
        ],
    )
    topofile = OutputFile(args.out + ".counts", headers=["rank", "topology", "count"])
    sample_cols = args.samples and mvf.get_sample_indices(args.samples) or []
    if args.tempdir:
        tmpdir = os.path.abspath(args.tempdir)
    else:
        tmpdir = os.path.abspath("./raxmltemp")
    if not os.path.exists(tmpdir):
        os.mkdir(tmpdir)
    os.chdir(tmpdir)
    ## SETUP PARAMS
    main_labels = mvf.get_sample_labels(sample_cols)
    if args.hapmode in ["randomboth", "majorminor"]:
        main_labels = [label + x for x in ["a", "b"] for label in main_labels]
    params = {
        "outgroups": args.raxml_outgroups or [],
        "rootwith": args.rootwith or [],
        "minsites": args.minsites,
        "minseqcoverage": args.minseqcoverage,
        "mindepth": args.mindepth,
        "raxmlpath": args.raxmlpath,
        "raxmlopts": args.raxmlopts,
        "duplicateseq": args.duplicateseq,
        "model": args.raxml_model,
        "bootstrap": args.bootstrap,
        "windowsize": args.windowsize,
        "hapmode": args.hapmode,
        "tempdir": tmpdir,
        "tempprefix": args.tempprefix,
    }
    ## WINDOW START ITERATION
    current_contig = ""
    window_start = 0
    window = None
    topo_ids = {}
    topo_counts = {}
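    # Iterate entries; when the contig changes or a window boundary is crossed,
    # build a RAxML tree from the accumulated window and tally its topology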
    for contig, pos, allelesets in mvf.iterentries(
        contigs=args.contigs,
        subset=sample_cols,
        quiet=args.quiet,
        no_invariant=False,
        no_ambig=False,
        no_gap=False,
        decode=True,
    ):
        if contig != current_contig or (args.windowsize != -1 and (pos > window_start + args.windowsize)):
            if window:
                entry = window.maketree_raxml(params)
                if entry["status"] != "ok":
                    if args.outputempty:
                        treefile.write_entry(entry)
                else:
                    topo = entry["topology"]
                    topo_counts[topo] = topo_counts.get(topo, 0) + 1
                    if topo not in topo_ids:
                        topo_ids[topo] = topo_ids and max(topo_ids.values()) + 1 or 0
                    entry["topoid"] = topo_ids[topo]
                    treefile.write_entry(entry)
                window_start = (
                    (contig == current_contig and args.windowsize != -1) and window_start + args.windowsize or 0
                )
            current_contig = contig[:]
            window = None
            window = WindowData(
                window_params={
                    "contigname": (
                        args.outputcontiglabels and mvf.get_contig_label(current_contig) or current_contig[:]
                    ),
                    "windowstart": (args.windowsize == -1 and "-1" or window_start + 0),
                    "windowsize": args.windowsize,
                    "labels": main_labels[:],
                }
            )
        ## ADD ALLELES
        if args.hapmode != "none":
            allelesets[0] = hapsplit(allelesets[0], args.hapmode)
        window.append_alleles(allelesets[0], minsitedepth=args.minsitedepth)
    ## LAST LOOP
    entry = window.maketree_raxml(params)
    if entry["status"] != "ok":
        if args.outputempty:
            treefile.write_entry(entry)
    else:
        topo = entry["topology"]
        topo_counts[topo] = topo_counts.get(topo, 0) + 1
        if topo not in topo_ids:
            topo_ids[topo] = topo_ids and max(topo_ids.values()) + 1 or 0
        entry["topoid"] = topo_ids[topo]
        treefile.write_entry(entry)
    window = None
    ## END WINDOW ITERATION
    topo_list = sorted([(v, k) for k, v in topo_counts.iteritems()], reverse=True)
    for rank, [value, topo] in enumerate(topo_list):
        topofile.write_entry({"rank": rank, "count": value, "topology": topo})
    return ""
Example No. 8
import sys
from mvfbase import MultiVariantFile

mvf = MultiVariantFile(sys.argv[1], 'read')

#for entry in mvf.iterentries(contig_ids=['57881']):
#    print(entry)

print(mvf.get_header())
Example No. 9
def main(arguments=sys.argv[1:]):
    """Main method for mvf_join"""
    parser = argparse.ArgumentParser(description="""
        MVF joining both vertically (separate contigs) and
        horizontally (different samples)""")
    parser.add_argument("mvf", nargs="*", help="one or more mvf files")
    parser.add_argument("--out", help="output mvf file")
    parser.add_argument("--newcontigs", action="store_true",
                        help="Don't match contigs using labels (not IDs)")
    parser.add_argument("--newsamples", action="store_true",
                        help="Don't match samples using labels")
    parser.add_argument("--linebuffer", type=int, default=100000,
                        help="number of entries to write in a block")
    parser.add_argument("--main_header_file",
                        help="""name of MVF file to use the headers from
                                (default=first in list)""")
    parser.add_argument("--overwrite", action="store_true",
                        help="USE WITH CAUTION: force overwrite of outputs")
    parser.add_argument("--quiet", action="store_true",
                        help="suppress progress meter")
    parser.add_argument("-v", "--version", action="store_true",
                        help="display version information")
    args = parser.parse_args(args=arguments)
    if args.version:
        print("Version 2015-02-01: Initial Public Release")
        sys.exit()
    concatmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    ## Copy the first file's metadata
    if args.main_header_file:
        if args.main_header_file not in args.mvf:
            raise RuntimeError("{} not found in files".format(
                args.main_header_file))
        else:
            args.main_header_file = args.mvf.index(args.main_header_file)
    else:
        args.main_header_file = 0
    first_mvf = MultiVariantFile(args.mvf[args.main_header_file], 'read')
    concatmvf.metadata = first_mvf.metadata.copy()
    ## Open each MVF file, read headers to make unified header
    transformers = []
    for mvfname in args.mvf:
        ## This will create a dictionary of samples{old:new}, contigs{old:new}
        transformer = MvfTransformer()
        mvf = MultiVariantFile(mvfname, 'read')
        for i, label in enumerate(mvf.get_sample_labels()):
            if label not in concatmvf.get_sample_labels():
                concatmvf.metadata['labels'].append(label)
                concatmvf.metadata['samples'][
                    concatmvf.metadata['labels'].index(label)] = {
                        'label': label}
            if concatmvf.metadata['labels'].index(label) != i:
                transformer.set_label(
                    i, concatmvf.metadata['labels'].index(label))
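        # Merge contig metadata: reuse an existing contig id when the labels
        # match, otherwise add the contig and record any id remapping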
        for contigid, contigdata in mvf.metadata['contigs'].iteritems():
            if contigdata['label'] not in [
                    concatmvf.metadata['contigs'][x]['label']
                    for x in concatmvf.metadata['contigs']]:
                newid = (contigid not in concatmvf.metadata['contigs']
                         and contigid or concatmvf.get_next_contig_id())
                concatmvf.metadata['contigs'][newid] = contigdata
            else:
                for concatid, concatdata in (
                        concatmvf.metadata['contigs'].items()):
                    if contigdata['label'] == concatdata['label']:
                        newid = concatid
                        break
            if newid != contigid:
                transformer.set_contig(contigid, newid)
        transformers.append(transformer)
    ## Write output header
    concatmvf.write_data(concatmvf.get_header())
    ## Now loop through each file
    entries = []
    nentries = 0
    for ifile, mvfname in enumerate(args.mvf):
        if not args.quiet:
            sys.stderr.write("Processing {} ...\n".format(mvfname))
        transformer = transformers[ifile]
        mvf = MultiVariantFile(mvfname, 'read')
        for contigid, pos, allelesets in mvf.iterentries(decode=False,
                                                         quiet=args.quiet):
            if transformer.labels:
                allelesets = [mvf.decode(x) for x in allelesets]
                for j, alleles in enumerate(allelesets):
                    allelesets[j] = concatmvf.encode(''.join([
                        x in transformer.labels
                        and alleles[transformer.labels[x]] or alleles[x]
                        for x in xrange(len(alleles))]))
            if transformer.contigs:
                contigid = (contigid in transformer.contigs
                            and transformer.contigs[contigid]
                            or contigid)
            entries.append((contigid, pos, allelesets))
            nentries += 1
            if nentries == args.linebuffer:
                concatmvf.write_entries(entries)
                entries = []
                nentries = 0
        if entries:
            concatmvf.write_entries(entries)
            entries = []
            nentries = 0
        if not args.quiet:
            sys.stderr.write("done\n")
    return ''
Example No. 10
def calc_pairwise_dnds(self, args):
    """Calculate pairwise dN/dS using PAML among pairs of sequences.
       Note: written as an analysis-module method (the body relies on
       self.params and self.data).
       """
    mvf = MultiVariantFile(args.mvf, 'read')
    annotations = {}
    coordinates = {}
    if args.gff:
        annotations, coordinates = (parse_gff_annotate(args.gff))
    labels = mvf.get_sample_labels()[:]
    ncol = len(labels)
    current_contig = None
    current_position = 0
    counts = Counter()
    totals = Counter()
    if self.params['output_align']:
        outputalign = []
    fieldtags = [
        'likelihood', 'bgdnds0', 'bgdnds1', 'bgdnds2a', 'bgdnds2b', 'fgdnds0',
        'fgdnds1', 'fgdnds2a', 'fgdnds2b', 'dndstree', 'errorstate'
    ]
    with open(self.params['branchlrt'], 'w') as branchlrt:
        genealign = []
        branchlrt.write("\t".join(
            ['contig', 'ntaxa', 'alignlength', 'lrtscore'] +
            ["null.{}".format(x)
             for x in fieldtags] + ["test.{}".format(x)
                                    for x in fieldtags] + ['tree']) + "\n")
    groups = self.params['allele_groups'].values()
    speciesgroups = self.params['speciesgroups'].values()
    allsets = set([])
    for group in groups:
        allsets.update(group)
    allsets = list(sorted(allsets))
    speciesrev = {}
    for species in self.params['speciesgroups']:
        speciesrev.update([(x, species)
                           for x in self.params['speciesgroups'][species]])
    if self.params['mincoverage']:
        if self.params['mincoverage'] < len(groups) * 2:
            raise RuntimeError("""
                Error: GroupUniqueAlleleWindow:
                --mincoverage cannot be lower than twice the number
                of specified groups in --allele-groups
                """)
    for contig, pos, allelesets in mvf:
        if not current_contig:
            current_contig = contig[:]
        if contig != current_contig or (
                self.params['windowsize'] != -1
                and pos > current_position + self.params['windowsize']):
            xkey = (
                current_contig,
                current_position,
            )
            self.data[xkey] = counts.copy()
            self.data[xkey].update([
                ('contig', (self.params['uselabels']
                            and mvf.get_contig_label(current_contig))),
                ('position', current_position),
                ('nonsynyonymous_changes',
                 counts.get('nonsynonymous_changes', 0) or 0),
                ('synyonymous_changes', counts.get('synonymous_changes', 0)
                 or 0)
            ])
            self.data[xkey].update([
                ('ns_ratio',
                 (float(self.data[xkey].get('nonsynonymous_changes', 0)) /
                  (self.data[xkey].get('synonymous_changes', 1.0)))),
                ('annotation', annotations.get(self.data[xkey]['contig'],
                                               '.')),
                ('coordinates', coordinates.get(self.data[xkey]['contig'],
                                                '.'))
            ])
            if genealign:
                if (self.params.get('endcontig', 1000000) >=
                        int(current_contig)) and (self.params.get(
                            'startcontig', 0) <= int(current_contig)):
                    # print(current_contig)
                    (dnval, dsval) = paml_pwcalc_dnds(genealign)
                    with open(self.params['branchlrt'], 'a') as branchlrt:
                        branchlrt.write("\t".join([
                            str(x) for x in [
                                self.data[xkey]['contig'],
                                len(genealign),
                                len(genealign[0]) * 3, dnval, dsval
                            ]
                        ]) + "\n")
            genealign = None
            totals.add('genes_total')
            if counts.get('total_codons', 0) > 0:
                totals.add('genes_tested')
            if counts.get('total_nsyn_codons', 0) > 0:
                totals.add('genes_with_nsyn')
            if contig != current_contig:
                current_contig = contig[:]
                current_position = 0
            elif self.params['windowsize'] != -1:
                current_position += self.params['windowsize']
            counts = Counter()
        proteins = allelesets[0]
        codons = allelesets[1:4]
        if len(proteins) == 1 and all(len(x) == 1 for x in codons):
            if proteins == '*' or ''.join(codons) in MLIB.stop_codons:
                continue
            counts.add('total_codons')
            totals.add('total_codons')
            if self.params['output_align']:
                if not outputalign:
                    outputalign = [[''.join(codons)]
                                   for x in range(mvf.metadata['ncol'])]
                else:
                    for ialign in range(len(outputalign)):
                        outputalign[ialign].append(''.join(codons))
            if self.params['branchlrt']:
                if not genealign:
                    genealign = [[''.join(codons)] for x in range(ncol)]
                else:
                    for ialign in range(len(genealign)):
                        genealign[ialign].append(''.join(codons))
            continue
        if len(proteins) > 1:
            if allelesets[0][1] == '+':
                continue
        proteins = mvf.decode(proteins)
        if self.params['mincoverage']:
            if sum([int(x not in 'X-')
                    for x in proteins]) < (self.params['mincoverage']):
                continue
        species_groups = [[proteins[i] for i in x if proteins[i] not in '-X']
                          for x in speciesgroups]
        if any(len(x) == 0 for x in species_groups):
            continue
        xcodons = [mvf.decode(x) for x in codons]
        codons = [''.join(x) for x in zip(*xcodons)]
        if any(codons[x] in MLIB.stop_codons for x in allsets):
            continue
        if any(
                any(x != species_groups[0][0] for x in y)
                for y in species_groups):
            totals.add('total_nsyn_codons')
            counts.add('total_nsyn_codons')
        totals.add('total_codons')
        totals.add('tested_codons')
        counts.add('total_codons')
        totals.add('variable_codons',
                   val=int(
                       sum([int(len(set(x) - set('X-')) > 1)
                            for x in xcodons]) > 0))
        if self.params['output_align']:
            if not outputalign:
                outputalign = [[x] for x in codons]
            else:
                for ialign in range(len(outputalign)):
                    outputalign[ialign].append(codons[ialign])
        if self.params['branchlrt']:
            if not genealign:
                genealign = [[x] for x in codons]
            else:
                for ialign in range(len(codons)):
                    genealign[ialign].append(codons[ialign])
        nonsyn_change = False
        synon_change = False
        codon_groups = [
            set([
                codons[i] for i in x
                if '-' not in codons[i] and 'X' not in codons[i]
            ]) for x in groups
        ]
        protein_groups = None
        for i in range(len(codon_groups)):
            if any(base in codon for base in 'RYWKMS'
                   for codon in codon_groups[i]):
                codon_groups[i] = hapgroup(codon_groups[i])
        if all(
                grp1.isdisjoint(grp0)
                for grp0, grp1 in combinations(codon_groups, 2)):
            protein_groups = [
                set([
                    MLIB.codon_table['full'][''.join(x)]
                    for x in codon_groups[i]
                ]) for i in range(len(codon_groups))
            ]
            if all(
                    grp1.isdisjoint(grp0)
                    for grp0, grp1 in combinations(protein_groups, 2)):
                nonsyn_change = True
            elif all(grp1 == grp0
                     for grp0, grp1 in combinations(protein_groups, 2)):
                synon_change = True
        if nonsyn_change:
            print('NON', contig, pos, allelesets, codon_groups, protein_groups,
                  groups, mvf.get_contig_label(contig))
            counts.add('nonsynonymous_changes')
            totals.add('nonsynonymous_changes')
        elif synon_change:
            print('SYN', contig, pos, allelesets, codon_groups, protein_groups,
                  groups, mvf.get_contig_label(contig))
            counts.add('synonymous_changes')
            totals.add('synonymous_changes')
    self.params['totals'] = totals
    self.write()
    if self.params['output_align']:
        with open(self.params['output_align'], 'w') as alignfile:
            alignfile.write("\n".join([
                ">{}\n{}".format(mvf.metadata['labels'][i],
                                 ''.join(outputalign[i]))
                for i in range(len(outputalign))
            ]))
    return ''
Example No. 11
def main(arguments=sys.argv[1:]):
    """Main method for geno2mvf"""
    parser = argparse.ArgumentParser(description="""
    Converts GATK Genotype Format to MVF file with some filters """)
    parser.add_argument("--geno", help="input .geno file", required=True)
    parser.add_argument("--out", help="output MVF file", required=True)
    parser.add_argument("--contigids", nargs='*',
                        help=("manually specify one or more contig ids"
                              " as ID:NAME"))
    parser.add_argument("--samplereplace", nargs="*",
                        help="""one or more TAG:NEWLABEL or TAG, items,
                                if TAG found in sample label, replace with
                                NEW (or TAG if NEW not specified)
                                NEW and TAG must each be unique""")
    parser.add_argument("--reflabel", default="REF",
                        help="""label of the reference sample
                                (default is first entry)""")
    parser.add_argument("--no_autoindex", action="store_true",
                        help="do not automatically index contigs")
    parser.add_argument("--fieldsep", default="SPACE",
                        choices=['TAB', 'SPACE', 'DBLSPACE', 'COMMA', 'MIXED'],
                        help="""entry field separator (default='SPACE')""")
    parser.add_argument("--linebuffer", type=int, default=100000,
                        help="number of lines to hold in read/write buffer")
    parser.add_argument("--overwrite", action="store_true",
                        help="USE WITH CAUTION: force overwrite of outputs")
    parser.add_argument("--quiet", action="store_true",
                        help="suppress progress meter")
    parser.add_argument("-v", "--version", action="store_true",
                        help="display version information")
    args = parser.parse_args(args=arguments)
    if args.version:
        print("Version 2015-02-01: Initial Public Release")
        sys.exit()
    sepchars = dict([("TAB", "\t"), ("SPACE", " "), ("DBLSPACE", "  "),
                     ("COMMA", ","), ("MIXED", None)])
    args.fieldsep = sepchars[args.fieldsep]
    ## ESTABLISH GENO
    geno = GenoFile(args.geno, indexcontigs=(not args.no_autoindex))
    ## ESTABLISH MVF
    mvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    # PROCESS CONTIG INFO
    contigs = geno.metadata['contigs'].copy()
    maxcontigid = 0
    newids = set([])
    if args.contigids:
        for cid, cname in (x.split(':') for x in args.contigids):
            for tempid in contigs:
                if cname in contigs[tempid]['label']:
                    try:
                        cid = int(cid)
                    except ValueError:
                        pass
                    mvf.metadata['contigs'][cid] = contigs[tempid].copy()
                    del contigs[tempid]
                    newids.update([cid])
                    break
        for cid in newids:
            try:
                maxcontigid = max([maxcontigid, int(cid) + 1])
            except ValueError:
                continue
    tempids = set(contigs.keys()) - newids
    for tempid, newid in sorted(zip(
            tempids, xrange(maxcontigid, maxcontigid + len(tempids)))):
        mvf.metadata['contigs'][newid] = geno.metadata['contigs'][tempid]
    contig_translate = dict([(mvf.metadata['contigs'][x]['label'], x)
                             for x in mvf.metadata['contigs']])
    # PROCESS SAMPLE INFO
    samplelabels = geno.metadata['samples'][:]
    if args.samplereplace:
        newsample = [':' in tuple(x) and x.split(':') or tuple([x, x])
                     for x in args.samplereplace]
        unmatched = [x for x in enumerate(samplelabels)]
        for old, new in newsample:
            labelmatched = False
            for j, (i, name) in enumerate(unmatched):
                if old in name:
                    samplelabels[i] = new
                    labelmatched = j
                    break
            if labelmatched is not False:
                del unmatched[labelmatched]
    mvf.metadata['labels'] = samplelabels[:]
    for i, label in enumerate(samplelabels):
        mvf.metadata['samples'][i] = {'label': label}
    mvf.metadata['ncol'] = len(mvf.metadata['labels'])
    mvf.metadata['sourceformat'] = geno.metadata['sourceformat']
    ## WRITE MVF HEADER
    mvf.write_data(mvf.get_header())
    mvfentries = []
    nentry = 0
    for record in geno.iterentries(vars(args)):
        mvf_alleles = encode_mvfstring(''.join(record['genotypes']))
        if mvf_alleles:
            mvfentries.append(
                (contig_translate.get(record['contig'], record['contig']),
                 record['coord'], (mvf_alleles,)))
            nentry += 1
            if nentry == args.linebuffer:
                mvf.write_entries(mvfentries, encoded=True)
                mvfentries = []
                nentry = 0
    if mvfentries:
        mvf.write_entries(mvfentries, encoded=True)
        mvfentries = []
    return ''
Example No. 12
def main(arguments=sys.argv[1:]):
    """Main method for maf2mvf"""
    parser = argparse.ArgumentParser(description="""
    Converts Multiple Alignment Files to MVF file with some filters """)
    parser.add_argument("--maf", help="input MAF file")
    parser.add_argument("--out", help="output MVF file")
    parser.add_argument("--reftag", help="old reference tag")
    parser.add_argument("--mvfreflabel", default="REF",
                        help="new label for reference sample (default='REF')")
    parser.add_argument("--contigids", nargs='*',
                        help=("manually specify one or more contig ids"
                              " as ID:NAME"))
    parser.add_argument("--sampletags", nargs="*",
                        help="""one or more TAG:NEWLABEL or TAG, items,
                                if TAG found in sample label, replace with
                                NEW (or TAG if NEW not specified)
                                NEW and TAG must each be unique""")
    parser.add_argument("--linebuffer", type=int, default=100000,
                        help="number of lines to hold in read/write buffer")
    parser.add_argument("--overwrite", action="store_true")
    args = parser.parse_args(args=arguments)
    ## ESTABLISH MAF
    maf = MultiAlignFile(args)
    ## ESTABLISH MVF
    mvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    # PROCESS CONTIG INFO
    # contigs = dict.fromkeys((sorted([x for x in maf.meta['name_index']
    #                             if x.find(args.reftag) > -1])), {})
    # print(contigs)
    # maxcontigid = 0
    # newids = set([])
    # if args.contigids:
    #     for cid, cname in (x.split(':') for x in args.contigids):
    #         for tempid in contigs:
    #             if cname in contigs[tempid]['label']:
    #                 try:
    #                     cid = int(cid)
    #                 except ValueError:
    #                     pass
    #                 mvf.metadata['contigs'][cid] = contigs[tempid].copy()
    #                 del contigs[tempid]
    #                 newids.update([cid])
    #                 break
    #     for cid in newids:
    #         try:
    #             maxcontigid = max([maxcontigid, int(cid) + 1])
    #         except ValueError:
    #             continue
    # tempids = set(contigs.keys()) - newids
    # for tempid, newid in zip(
    #         tempids, xrange(maxcontigid, maxcontigid + len(tempids))):
    #    # mvf.metadata['contigs'][newid] = maf.meta['contigs'][tempid]
    #         pass
    # contig_translate = dict([(mvf.metadata['contigs'][x]['label'], x)
    #                          for x in mvf.metadata['contigs']])
    # PROCESS SAMPLE INFO
    contig_translate = {1: 1}
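    # Build the sample list from --sampletags, moving the reference tag to
    # column 0 so it becomes the MVF reference sample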
    samplelabels = [s.split(':')[0] for s in args.sampletags]
    samplelabels.remove(args.reftag)
    samplelabels.insert(0, args.reftag)
    # if args.samplereplace:
    #     newsample = [':' in tuple(x) and x.split(':') or tuple([x,x])
    #                  for x in args.samplereplace]
    mvf.metadata['labels'] = samplelabels[:]
    for i, label in enumerate(samplelabels):
        mvf.metadata['samples'][i] = {'label': label}
    mvf.metadata['ncol'] = len(mvf.metadata['labels'])
    mvf.metadata['sourceformat'] = maf.metadata['sourceformat']
    ## WRITE MVF HEADER
    mvf.write_data(mvf.get_header())
    mvfentries = []
    nentry = 0
    for pos, length, msa in maf:
        for s in samplelabels:
            if s not in msa:
                msa[s] = '-'*length
        msa['contig'] = 1
        for i in range(length):
            mvf_alleles = encode_mvfstring(
                ''.join(msa[s][i].strip() for s in samplelabels))
            if mvf_alleles:
                mvfentries.append(
                    (contig_translate.get(msa['contig']),
                     pos+i, (mvf_alleles,)))
                nentry += 1
                if nentry == args.linebuffer:
                    mvf.write_entries(mvfentries, encoded=True)
                    mvfentries = []
                    nentry = 0
    if mvfentries:
        mvf.write_entries(mvfentries, encoded=True)

    return ''
Example No. 13
def main(arguments=sys.argv[1:]):
    """Main method for mvf2fasta"""
    parser = argparse.ArgumentParser(description="""
    Process MVF into FASTA alignment""")
    parser.add_argument("--mvf", help="input MVF file", required=True)
    parser.add_argument("--out", help="target FASTA file", required=True)
    parser.add_argument("--labeltype", choices=['long', 'short'],
                        default='long',
                        help="long labels with all metadata or short ids")
    parser.add_argument("--regions", nargs='*',
                        help="one or more regions id,start,stop (inclusive)")
    parser.add_argument("--samples", nargs='*',
                        help="one or more taxon labels, leave blank for all")
    parser.add_argument("--outgroups", nargs="*")
    parser.add_argument("--contigs", nargs='*',
                        help="one or more taxon labels, leave blank for all")
    parser.add_argument("--buffer", type=int, default=10,
                        help="size (Mbp) of write buffer for each sample")
    parser.add_argument("--tmpdir", default=".",
                        help="directory to write temporary fasta files")
    parser.add_argument("--quiet", action="store_true", default=True,
                        help="suppress screen output")
    parser.add_argument("-v", "--version", action="store_true",
                        help="display version information")
    args = parser.parse_args(args=arguments)
    if args.version:
        print("Version 2015-02-01: Initial Public Release")
        sys.exit()
    mvf = MultiVariantFile(args.mvf, 'read')
    if args.contigs:
        contigs = dict((c, mvf.metadata['contigs'][c]) for c in args.contigs)
    else:
        contigs = dict(mvf.metadata['contigs'])
    sample_cols = mvf.get_sample_indices(args.samples or None)
    labels = mvf.get_sample_labels(sample_cols)
    current_contig = None
    tmp_files = dict((fn, open(fn+'.tmp', 'w+', args.buffer)) for fn in labels)
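    # Stream each sample's sequence into its own temporary file, then
    # concatenate the temporary files into the final FASTA output below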
    for contig, _, allelesets in mvf.iterentries(
            contigs=args.contigs, subset=sample_cols,
            quiet=args.quiet, decode=True):
        alleles = allelesets[0]
        if current_contig != contig:
            current_contig = contig
            for col, label in zip(sample_cols, labels):
                if args.labeltype == 'long':
                    tmp_files[label].write(
                        '\n>{} contig={}  length={}\n{}'.format(
                            label,
                            contigs[current_contig]['label'],
                            contigs[current_contig]['length'],
                            alleles[col]))
                elif args.labeltype == 'short':
                    tmp_files[label].write(
                        '\n>{}_{}\n{}'.format(
                            label, contigs[current_contig]['label'],
                            alleles[col]))
        else:
            for col, label in zip(sample_cols, labels):
                tmp_files[label].write(alleles[col])
    with open(args.out, 'w') as outfile:
        for filehandler in tmp_files.values():
            filehandler.seek(0, 0)
            buff = filehandler.read(args.buffer)
            while buff:
                outfile.write(buff)
                buff = filehandler.read(args.buffer)
            filehandler.close()
            os.remove(os.path.join(args.tmpdir, filehandler.name))
    return ''