Пример #1
0
def main(arguments=None):
    """Write one FASTA file per contig from an MVF file.

    Sequences are accumulated per sample label while iterating the MVF
    entries; each time the contig changes (and once more at the end) the
    accumulated sequences are written to
    ``<outprefix>.<contig label>.fa``.

    Args:
        arguments: list of command-line tokens; defaults to
            ``sys.argv[1:]`` when None.

    Returns:
        An empty string.

    Raises:
        RuntimeError: if ``--outdata`` cannot be produced from the
            flavor recorded in the MVF metadata.
    """
    arguments = sys.argv[1:] if arguments is None else arguments
    parser = generate_argparser()
    args = parser.parse_args(args=arguments)
    mvf = MultiVariantFile(args.mvf, 'read')
    flavor = mvf.metadata['flavor']
    if (flavor in ("dna", "rna") and args.outdata == "prot") or (
            flavor == "prot" and args.outdata in ("dna", "rna")):
        raise RuntimeError(
            "--outdata {} incompatiable with '{}' flavor mvf".format(
                args.outdata, flavor))
    sample_cols = mvf.get_sample_indices(args.samples or None)
    labels = mvf.get_sample_labels(sample_cols)

    def write_contig(contig_id, contig_seqs):
        """Write the accumulated sequences of one contig to its file."""
        path = "{}.{}.fa".format(
            args.outprefix, mvf.metadata['contigs'][contig_id]['label'])
        with open(path, 'wt') as outfile:
            for seqname in sorted(contig_seqs):
                outfile.write(">{}\n{}\n".format(
                    seqname, ''.join(contig_seqs[seqname])))

    def mask(allele):
        """Replace the 'X' placeholder with 'N' for nucleotide output."""
        return 'N' if allele == 'X' else allele

    current_contig = ''
    seqs = {}
    for contig, _, allelesets in mvf.iterentries(quiet=args.quiet,
                                                 decode=True):
        if contig != current_contig:
            if seqs:
                # The buffered data belongs to the contig we are leaving,
                # so the file must be named after current_contig, not
                # after the contig that has just started.
                write_contig(current_contig, seqs)
            seqs = {}
            current_contig = contig
        for col, label in zip(sample_cols, labels):
            sequence = seqs.setdefault(label, [])
            if flavor in ('dna', 'rna'):
                sequence.append(mask(allelesets[0][col]))
            elif flavor in ('codon', 'prot') and args.outdata == 'prot':
                sequence.append(allelesets[0][col])
            elif flavor == 'codon' and args.outdata == 'dna':
                # Allele sets 1-3 are read as the three codon positions.
                sequence.extend(
                    mask(allelesets[x][col]) for x in (1, 2, 3))
    if seqs:
        write_contig(current_contig, seqs)
    return ''
Пример #2
0
def main(arguments=None):
    """Build RAxML trees for windows of an MVF file and tally topologies.

    Entries are read from the MVF file and grouped into windows (by
    contig and ``--windowsize``). Each finished window is passed to
    ``WindowData.maketree_raxml`` and the resulting entry is written to
    the ``--out`` file; topology counts are written to ``<out>.counts``.

    Args:
        arguments: list of command-line tokens; defaults to
            ``sys.argv[1:]`` (read at call time) when None.

    Returns:
        An empty string.

    Side effects:
        Creates the temporary directory if needed and changes the
        process working directory to it.
    """
    # Resolve argv at call time; a ``sys.argv[1:]`` default would be
    # frozen when the function is defined.
    arguments = sys.argv[1:] if arguments is None else arguments
    parser = argparse.ArgumentParser(
        description="""
    Process MVF into alignment"""
    )
    parser.add_argument("--mvf", help="inputmvf")
    parser.add_argument("--out", help="tree list output file")
    parser.add_argument("--samples", nargs="*", help="one or more taxon labels, default=all")
    parser.add_argument("--raxml_outgroups", nargs="*", help="select outgroups to use in RAxML")
    parser.add_argument(
        "--rootwith",
        nargs="*",
        help="""root output trees with
                                these taxa after RAxML""",
    )
    parser.add_argument("--contigs", nargs="*", help="choose one or more contigs, default=all")
    parser.add_argument("--outputcontiglabels", action="store_true", help="output contig labels instead of ids")
    parser.add_argument("--outputempty", action="store_true", help="output entries of windows with no data")
    parser.add_argument(
        "--hapmode",
        default="none",
        choices=["none", "randomone", "randomboth", "major", "minor", "majorminor"],
        help="""haplotype splitting mode.
                                'none' = no splitting;
                                'randomone' = pick one allele randomly
                                              (recommended);
                                'randomboth = pick alleles randomly,
                                              keep both;
                                'major' = pick the more common allele;
                                'minor' = pick the less common allele;
                                'majorminor' = put the major in 'a' and
                                               minor in 'b'
                            """,
    )
    parser.add_argument(
        "--windowsize",
        type=int,
        default=10000,
        help="""specify genomic region size,
                                or use -1 for whole contig""",
    )
    parser.add_argument("--minsites", type=int, default=100, help="""minimum number of sites [100]""")
    parser.add_argument(
        "--minsitedepth",
        type=int,
        default=1,
        help="""mininum depth of sites to use in alignment
                                [1]""",
    )
    parser.add_argument(
        "--minseqcoverage",
        type=float,
        default=0.1,
        help="""proportion of total alignment a sequence
                                must cover to be retianed [0.1]""",
    )
    parser.add_argument("--mindepth", type=int, default=4, help="""minimum number of sequences [4]""")
    parser.add_argument(
        "--bootstrap",
        type=int,
        help="""turn on rapid bootstrapping for RAxML and
                             perform specified number of replicates""",
    )
    parser.add_argument("--raxml_model", default="GTRGAMMA", help="""choose custom RAxML model [GTRGAMMA]""")
    parser.add_argument("--raxmlpath", help="manually specify RAxML path")
    parser.add_argument("--raxmlopts", default="", help="specify additional RAxML arguments")
    parser.add_argument(
        "--duplicateseq",
        default="dontuse",
        choices=["dontuse", "keep", "remove"],
        help="""[dontuse] remove for tree making,
                                replace as zero-branch-length sister taxa;
                                keep=keep in for tree making,
                                may cause errors for RAxML;
                                remove=remove entirely from alignment""",
    )
    parser.add_argument("--tempdir", default="raxmltemp", help="""temporary dir. location default=./tempdir""")
    parser.add_argument("--tempprefix", default="mvftree", help="""temporary file prefix, default=mvftree""")
    parser.add_argument("--quiet", action="store_true", help="suppress screen output")
    parser.add_argument("-v", "--version", action="store_true", help="display version information")

    args = parser.parse_args(args=arguments)
    if args.version:
        print("Version 2015-02-26")
        sys.exit()
    ## ESTABLISH FILE OBJECTS
    args.contigs = args.contigs or []
    mvf = MultiVariantFile(args.mvf, "read")
    treefile = OutputFile(
        args.out,
        headers=[
            "contig",
            "windowstart",
            "windowsize",
            "tree",
            "topology",
            "topoid",
            # 'templabels', ### USED FOR DEBUGGING ###
            "alignlength",
            "aligndepth",
            "status",
        ],
    )
    topofile = OutputFile(args.out + ".counts", headers=["rank", "topology", "count"])
    sample_cols = (mvf.get_sample_indices(args.samples) or []) if args.samples else []
    tmpdir = os.path.abspath(args.tempdir or "./raxmltemp")
    if not os.path.exists(tmpdir):
        os.mkdir(tmpdir)
    # NOTE(review): the working directory changes after the input and
    # output objects were created from possibly relative paths - confirm
    # they open their files eagerly.
    os.chdir(tmpdir)
    ## SETUP PARAMS
    main_labels = mvf.get_sample_labels(sample_cols)
    if args.hapmode in ["randomboth", "majorminor"]:
        # Two haplotypes per sample: all 'a' labels first, then all 'b'.
        main_labels = [label + x for x in ["a", "b"] for label in main_labels]
    params = {
        "outgroups": args.raxml_outgroups or [],
        "rootwith": args.rootwith or [],
        "minsites": args.minsites,
        "minseqcoverage": args.minseqcoverage,
        "mindepth": args.mindepth,
        "raxmlpath": args.raxmlpath,
        "raxmlopts": args.raxmlopts,
        "duplicateseq": args.duplicateseq,
        "model": args.raxml_model,
        "bootstrap": args.bootstrap,
        "windowsize": args.windowsize,
        "hapmode": args.hapmode,
        "tempdir": tmpdir,
        "tempprefix": args.tempprefix,
    }
    topo_ids = {}
    topo_counts = {}

    def record_tree(entry):
        """Write one window result and tally its topology if it is ok."""
        if entry["status"] != "ok":
            if args.outputempty:
                treefile.write_entry(entry)
            return
        topo = entry["topology"]
        topo_counts[topo] = topo_counts.get(topo, 0) + 1
        if topo not in topo_ids:
            # Ids are handed out sequentially in order of first sighting.
            topo_ids[topo] = max(topo_ids.values()) + 1 if topo_ids else 0
        entry["topoid"] = topo_ids[topo]
        treefile.write_entry(entry)

    ## WINDOW START ITERATION
    current_contig = ""
    window_start = 0
    window = None
    for contig, pos, allelesets in mvf.iterentries(
        contigs=args.contigs,
        subset=sample_cols,
        quiet=args.quiet,
        no_invariant=False,
        no_ambig=False,
        no_gap=False,
        decode=True,
    ):
        if contig != current_contig or (args.windowsize != -1 and (pos > window_start + args.windowsize)):
            # Truthiness (not ``is not None``) is kept from the original;
            # it depends on how WindowData defines emptiness.
            if window:
                record_tree(window.maketree_raxml(params))
                if contig == current_contig and args.windowsize != -1:
                    window_start = window_start + args.windowsize
                else:
                    window_start = 0
            current_contig = contig[:]
            if args.outputcontiglabels:
                # Fall back to the id when the contig has no label.
                contigname = mvf.get_contig_label(current_contig) or current_contig[:]
            else:
                contigname = current_contig[:]
            window = WindowData(
                window_params={
                    "contigname": contigname,
                    "windowstart": "-1" if args.windowsize == -1 else window_start,
                    "windowsize": args.windowsize,
                    "labels": main_labels[:],
                }
            )
        ## ADD ALLELES
        if args.hapmode != "none":
            allelesets[0] = hapsplit(allelesets[0], args.hapmode)
        window.append_alleles(allelesets[0], minsitedepth=args.minsitedepth)
    ## LAST LOOP
    # With no entries at all there is no window to flush; the original
    # crashed here with an AttributeError on None.
    if window is not None:
        record_tree(window.maketree_raxml(params))
    window = None
    ## END WINDOW ITERATION
    # ``items()`` works on both Python 2 and 3 (``iteritems`` is 2-only).
    topo_list = sorted([(count, topo) for topo, count in topo_counts.items()], reverse=True)
    for rank, (value, topo) in enumerate(topo_list):
        topofile.write_entry({"rank": rank, "count": value, "topology": topo})
    return ""
Пример #3
0
def main(arguments=None):
    """Join several MVF files into one output MVF (mvf_join).

    The header of one input file (``--main_header_file`` or the first
    file) seeds the output metadata. Every input is then scanned once to
    extend the unified sample/contig metadata and to build a per-file
    transformer, and a second time to copy its entries, remapped through
    that transformer, into the output in blocks of ``--linebuffer``.

    Args:
        arguments: list of command-line tokens; defaults to
            ``sys.argv[1:]`` (read at call time) when None.

    Returns:
        An empty string.

    Raises:
        RuntimeError: if ``--main_header_file`` is not one of the input
            files.
    """
    # Resolve argv at call time; a ``sys.argv[1:]`` default would be
    # frozen when the function is defined.
    arguments = sys.argv[1:] if arguments is None else arguments
    parser = argparse.ArgumentParser(description="""
        MVF joining both veritically (separate contigs) and
        and horizontally (different samples)""")
    parser.add_argument("mvf", nargs="*", help="one or more mvf files")
    parser.add_argument("--out", help="output mvf file")
    parser.add_argument("--newcontigs", action="store_true",
                        help="Don't match contigs using labels (not IDs)")
    parser.add_argument("--newsamples", action="store_true",
                        help="Don't match samples using labels")
    parser.add_argument("--linebuffer", type=int, default=100000,
                        help="number of entries to write in a block")
    parser.add_argument("--main_header_file",
                        help="""name of MVF file to use the headers from
                                (default=first in list)""")
    parser.add_argument("--overwrite", action="store_true",
                        help="USE WITH CAUTION: force overwrite of outputs")
    parser.add_argument("--quiet", action="store_true",
                        help="suppress progress meter")
    parser.add_argument("-v", "--version", action="store_true",
                        help="display version information")
    args = parser.parse_args(args=arguments)
    if args.version:
        print("Version 2015-02-01: Initial Public Release")
        sys.exit()
    concatmvf = MultiVariantFile(args.out, 'write', overwrite=args.overwrite)
    ## Copy the chosen file's metadata
    header_index = 0
    if args.main_header_file:
        if args.main_header_file not in args.mvf:
            raise RuntimeError("{} not found in files".format(
                args.main_header_file))
        header_index = args.mvf.index(args.main_header_file)
    first_mvf = MultiVariantFile(args.mvf[header_index], 'read')
    concatmvf.metadata = first_mvf.metadata.copy()
    ## Open each MVF file, read headers to make unified header
    transformers = []
    for mvfname in args.mvf:
        ## This will create a dictionary of samples{old:new}, contigs{old:new}
        transformer = MvfTransformer()
        mvf = MultiVariantFile(mvfname, 'read')
        for i, label in enumerate(mvf.get_sample_labels()):
            if label not in concatmvf.get_sample_labels():
                concatmvf.metadata['labels'].append(label)
                concatmvf.metadata['samples'][
                    concatmvf.metadata['labels'].index(label)] = {
                        'label': label}
            newindex = concatmvf.metadata['labels'].index(label)
            if newindex != i:
                transformer.set_label(i, newindex)
        # ``items()`` works on both Python 2 and 3 (``iteritems`` is 2-only).
        for contigid, contigdata in mvf.metadata['contigs'].items():
            for concatid, concatdata in (
                    concatmvf.metadata['contigs'].items()):
                if contigdata['label'] == concatdata['label']:
                    # Contig already known under this label: reuse its id.
                    newid = concatid
                    break
            else:
                # New label: keep the id when it is free, otherwise ask
                # the output file for a fresh one.
                if (contigid not in concatmvf.metadata['contigs']
                        and contigid):
                    newid = contigid
                else:
                    newid = concatmvf.get_next_contig_id()
                concatmvf.metadata['contigs'][newid] = contigdata
            if newid != contigid:
                transformer.set_contig(contigid, newid)
        transformers.append(transformer)
    ## Write output header
    concatmvf.write_data(concatmvf.get_header())
    ## Now loop through each file
    entries = []
    for ifile, mvfname in enumerate(args.mvf):
        if not args.quiet:
            sys.stderr.write("Processing {} ...\n".format(mvfname))
        transformer = transformers[ifile]
        mvf = MultiVariantFile(mvfname, 'read')
        for contigid, pos, allelesets in mvf.iterentries(decode=False,
                                                         quiet=args.quiet):
            if transformer.labels:
                allelesets = [mvf.decode(x) for x in allelesets]
                for j, alleles in enumerate(allelesets):
                    # NOTE(review): columns are read through the old->new
                    # label map exactly as before; confirm the mapping
                    # direction against MvfTransformer.set_label.
                    allelesets[j] = concatmvf.encode(''.join([
                        alleles[transformer.labels[x]]
                        if x in transformer.labels else alleles[x]
                        for x in range(len(alleles))]))
            if transformer.contigs:
                # Use the ``contigs`` attribute, consistent with every
                # other access in this function (the original subscripted
                # the transformer object itself here).
                if contigid in transformer.contigs:
                    contigid = transformer.contigs[contigid] or contigid
            entries.append((contigid, pos, allelesets))
            if len(entries) == args.linebuffer:
                concatmvf.write_entries(entries)
                entries = []
        if entries:
            concatmvf.write_entries(entries)
            entries = []
        if not args.quiet:
            sys.stderr.write("done\n")
    return ''
Пример #4
0
def calc_pairwise_dnds(args):
    """Scan a codon MVF and compute pairwise dN/dS per contig/window.

    Iterates the entries of the MVF file named by ``args.mvf``, tallies
    codon statistics per contig/window, accumulates a codon alignment
    and, at each contig/window boundary, passes that alignment to
    ``paml_pwcalc_dnds`` and appends the result to the 'branchlrt' file.

    Args:
        args: parsed arguments; only ``args.mvf`` and ``args.gff`` are
            read in this function.

    Returns:
        An empty string.

    NOTE(review): ``self`` is neither a parameter nor defined in this
    function, yet ``self.params``, ``self.data`` and ``self.write()``
    are used throughout. Unless a global named ``self`` exists, the
    first such access raises NameError. The body looks like it was
    lifted from a method - confirm the intended source of these values.
    """
    mvf = MultiVariantFile(args.mvf, 'read')
    annotations = {}
    coordinates = {}
    if args.gff:
        annotations, coordinates = (parse_gff_annotate(args.gff))
    labels = mvf.get_sample_labels()[:]
    ncol = len(labels)
    current_contig = None
    current_position = 0
    # NOTE(review): ``.add()`` is called on these objects below, which
    # collections.Counter does not provide - presumably ``Counter`` is a
    # project-defined class; confirm.
    counts = Counter()
    totals = Counter()
    # ``outputalign`` only exists when 'output_align' is truthy; all
    # later uses are guarded by the same condition.
    if self.params['output_align']:
        outputalign = []
    fieldtags = [
        'likelihood', 'bgdnds0', 'bgdnds1', 'bgdnds2a', 'bgdnds2b', 'fgdnds0',
        'fgdnds1', 'fgdnds2a', 'fgdnds2b', 'dndstree', 'errorstate'
    ]
    # Truncate the 'branchlrt' file and write its header row.
    # NOTE(review): this header has 27 columns, while the data rows
    # appended further down contain only 5 values - confirm which layout
    # is intended.
    with open(self.params['branchlrt'], 'w') as branchlrt:
        genealign = []
        branchlrt.write("\t".join(
            ['contig', 'ntaxa', 'alignlength', 'lrtscore'] +
            ["null.{}".format(x)
             for x in fieldtags] + ["test.{}".format(x)
                                    for x in fieldtags] + ['tree']) + "\n")
    groups = self.params['allele_groups'].values()
    speciesgroups = self.params['speciesgroups'].values()
    # Sorted union of every column index named in any allele group.
    allsets = set([])
    for group in groups:
        allsets.update(group)
    allsets = list(sorted(allsets))
    # Reverse map: group member -> species name (not used again below).
    speciesrev = {}
    for species in self.params['speciesgroups']:
        speciesrev.update([(x, species)
                           for x in self.params['speciesgroups'][species]])
    if self.params['mincoverage']:
        if self.params['mincoverage'] < len(groups) * 2:
            raise RuntimeError("""
                Error: GroupUniqueAlleleWindow:
                --mincoverage cannot be lower than the twice the number
                of specified groups in --allele-groups
                """)
    for contig, pos, allelesets in mvf:
        if not current_contig:
            current_contig = contig[:]
        # Contig or window boundary: store the finished counts, run the
        # pairwise calculation, then reset the per-window state.
        if contig != current_contig or (
                self.params['windowsize'] != -1
                and pos > current_position + self.params['windowsize']):
            xkey = (
                current_contig,
                current_position,
            )
            self.data[xkey] = counts.copy()
            # NOTE(review): the keys stored here are spelled
            # 'nonsynyonymous_changes' / 'synyonymous_changes', but the
            # values are read from (and 'ns_ratio' below uses) the
            # '...synonymous_changes' spelling - probably typos.
            # NOTE(review): when 'uselabels' is falsy, 'contig' is set to
            # that falsy value itself rather than to the contig id.
            self.data[xkey].update([
                ('contig', (self.params['uselabels']
                            and mvf.get_contig_label(current_contig))),
                ('position', current_position),
                ('nonsynyonymous_changes',
                 counts.get('nonsynonymous_changes', 0) or 0),
                ('synyonymous_changes', counts.get('synonymous_changes', 0)
                 or 0)
            ])
            # The 1.0 default only applies when the key is absent; a
            # stored value of 0 would divide by zero.
            self.data[xkey].update([
                ('ns_ratio',
                 (float(self.data[xkey].get('nonsynonymous_changes', 0)) /
                  (self.data[xkey].get('synonymous_changes', 1.0)))),
                ('annotation', annotations.get(self.data[xkey]['contig'],
                                               '.')),
                ('coordinates', coordinates.get(self.data[xkey]['contig'],
                                                '.'))
            ])
            if genealign:
                # Only contigs whose integer id lies within
                # [startcontig, endcontig] are passed on for calculation.
                if (self.params.get('endcontig', 1000000) >=
                        int(current_contig)) and (self.params.get(
                            'startcontig', 0) <= int(current_contig)):
                    # print(current_contig)
                    (dnval, dsval) = paml_pwcalc_dnds(genealign)
                    with open(self.params['branchlrt'], 'a') as branchlrt:
                        branchlrt.write("\t".join([
                            str(x) for x in [
                                self.data[xkey]['contig'],
                                len(genealign),
                                len(genealign[0]) * 3, dnval, dsval
                            ]
                        ]) + "\n")
            # Start a fresh alignment for the next contig/window.
            genealign = None
            totals.add('genes_total')
            if counts.get('total_codons', 0) > 0:
                totals.add('genes_tested')
            if counts.get('total_nsyn_codons', 0) > 0:
                totals.add('genes_with_nsyn')
            if contig != current_contig:
                current_contig = contig[:]
                current_position = 0
            elif self.params['windowsize'] != -1:
                current_position += self.params['windowsize']
            counts = Counter()
        # Allele set 0 is treated as the protein column string and sets
        # 1-3 as the three codon-position strings.
        proteins = allelesets[0]
        codons = allelesets[1:4]
        # Every set is a single character: presumably an invariant site
        # in the MVF encoding (TODO confirm); the same codon is appended
        # for every sample.
        if len(proteins) == 1 and all(len(x) == 1 for x in codons):
            if proteins == '*' or ''.join(codons) in MLIB.stop_codons:
                continue
            counts.add('total_codons')
            totals.add('total_codons')
            if self.params['output_align']:
                if not outputalign:
                    outputalign = [[''.join(codons)]
                                   for x in range(mvf.metadata['ncol'])]
                else:
                    for ialign in range(len(outputalign)):
                        outputalign[ialign].append(''.join(codons))
            if self.params['branchlrt']:
                if not genealign:
                    genealign = [[''.join(codons)] for x in range(ncol)]
                else:
                    for ialign in range(len(genealign)):
                        genealign[ialign].append(''.join(codons))
            continue
        # Skip entries whose second protein character is '+'
        # (presumably a special MVF allele encoding - TODO confirm).
        if len(proteins) > 1:
            if allelesets[0][1] == '+':
                continue
        proteins = mvf.decode(proteins)
        # Require at least 'mincoverage' samples that are neither 'X'
        # nor '-'.
        if self.params['mincoverage']:
            if sum([int(x not in 'X-')
                    for x in proteins]) < (self.params['mincoverage']):
                continue
        # Every species group needs at least one called amino acid.
        species_groups = [[proteins[i] for i in x if proteins[i] not in '-X']
                          for x in speciesgroups]
        if any(len(x) == 0 for x in species_groups):
            continue
        # Decode the three position strings and zip them into one
        # three-letter codon per sample.
        xcodons = [mvf.decode(x) for x in codons]
        codons = [''.join(x) for x in zip(*xcodons)]
        if any(codons[x] in MLIB.stop_codons for x in allsets):
            continue
        # Counted when any called amino acid differs from the first one
        # of the first species group.
        if any(
                any(x != species_groups[0][0] for x in y)
                for y in species_groups):
            totals.add('total_nsyn_codons')
            counts.add('total_nsyn_codons')
        totals.add('total_codons')
        totals.add('tested_codons')
        counts.add('total_codons')
        # Adds 1 when any codon position holds more than one distinct
        # base (ignoring 'X' and '-'), else 0.
        totals.add('variable_codons',
                   val=int(
                       sum([int(len(set(x) - set('X-')) > 1)
                            for x in xcodons]) > 0))
        if self.params['output_align']:
            if not outputalign:
                outputalign = [[x] for x in codons]
            else:
                for ialign in range(len(outputalign)):
                    outputalign[ialign].append(codons[ialign])
        if self.params['branchlrt']:
            if not genealign:
                genealign = [[x] for x in codons]
            else:
                for ialign in range(len(codons)):
                    genealign[ialign].append(codons[ialign])
        nonsyn_change = False
        synon_change = False
        # Per allele group: the set of fully called codons (no '-' / 'X').
        codon_groups = [
            set([
                codons[i] for i in x
                if '-' not in codons[i] and 'X' not in codons[i]
            ]) for x in groups
        ]
        protein_groups = None
        # Groups containing any of the codes R, Y, W, K, M, S are passed
        # through hapgroup() (presumably to resolve ambiguity codes -
        # TODO confirm).
        for i in range(len(codon_groups)):
            if any(base in codon for base in 'RYWKMS'
                   for codon in codon_groups[i]):
                codon_groups[i] = hapgroup(codon_groups[i])
        # Only sites where no two groups share a codon are classified:
        # nonsynonymous when the translated sets are pairwise disjoint,
        # synonymous when they are all identical.
        if all(
                grp1.isdisjoint(grp0)
                for grp0, grp1 in combinations(codon_groups, 2)):
            protein_groups = [
                set([
                    MLIB.codon_table['full'][''.join(x)]
                    for x in codon_groups[i]
                ]) for i in range(len(codon_groups))
            ]
            if all(
                    grp1.isdisjoint(grp0)
                    for grp0, grp1 in combinations(protein_groups, 2)):
                nonsyn_change = True
            elif all(grp1 == grp0
                     for grp0, grp1 in combinations(protein_groups, 2)):
                synon_change = True
        # NOTE(review): these print() calls look like leftover debugging
        # output written to stdout for every classified site.
        if nonsyn_change:
            print('NON', contig, pos, allelesets, codon_groups, protein_groups,
                  groups, mvf.get_contig_label(contig))
            counts.add('nonsynonymous_changes')
            totals.add('nonsynonymous_changes')
        elif synon_change:
            print('SYN', contig, pos, allelesets, codon_groups, protein_groups,
                  groups, mvf.get_contig_label(contig))
            counts.add('synonymous_changes')
            totals.add('synonymous_changes')
    # NOTE(review): the counts and alignment of the final contig/window
    # are never stored or passed to paml_pwcalc_dnds, because that only
    # happens at a boundary inside the loop.
    self.params['totals'] = totals
    self.write()
    if self.params['output_align']:
        with open(self.params['output_align'], 'w') as alignfile:
            alignfile.write("\n".join([
                ">{}\n{}".format(mvf.metadata['labels'][i],
                                 ''.join(outputalign[i]))
                for i in range(len(outputalign))
            ]))
    return ''
Пример #5
0
def main(arguments=None):
    """Convert an MVF file into a single FASTA alignment (mvf2fasta).

    Sequence data is streamed into one temporary file per sample label
    (inside ``--tmpdir``); a new FASTA header is started whenever the
    contig changes. The temporary files are then concatenated into
    ``--out`` and removed.

    Args:
        arguments: list of command-line tokens; defaults to
            ``sys.argv[1:]`` (read at call time) when None.

    Returns:
        An empty string.
    """
    # Resolve argv at call time; a ``sys.argv[1:]`` default would be
    # frozen when the function is defined.
    arguments = sys.argv[1:] if arguments is None else arguments
    parser = argparse.ArgumentParser(description="""
    Process MVF into FASTA alignment""")
    parser.add_argument("--mvf", help="input MVF file", required=True)
    parser.add_argument("--out", help="target FASTA file", required=True)
    parser.add_argument("--labeltype", choices=['long', 'short'],
                        default='long',
                        help="long labels with all metadata or short ids")
    parser.add_argument("--regions", nargs='*',
                        help="one or more regions id,start,stop (inclusive)")
    parser.add_argument("--samples", nargs='*',
                        help="one or more taxon labels, leave blank for all")
    parser.add_argument("--outgroups", nargs="*")
    parser.add_argument("--contigs", nargs='*',
                        help="one or more taxon labels, leave blank for all")
    parser.add_argument("--buffer", type=int, default=10,
                        help="size (Mbp) of write buffer for each sample")
    parser.add_argument("--tmpdir", default=".",
                        help="directory to write temporary fasta files")
    # NOTE(review): with default=True this flag can never be switched
    # off, so the run is always quiet; left unchanged to keep the
    # current command-line behavior.
    parser.add_argument("--quiet", action="store_true", default=True,
                        help="suppress screen output")
    parser.add_argument("-v", "--version", action="store_true",
                        help="display version information")
    args = parser.parse_args(args=arguments)
    if args.version:
        print("Version 2015-02-01: Initial Public Release")
        sys.exit()
    mvf = MultiVariantFile(args.mvf, 'read')
    if args.contigs:
        # dict() needs (key, value) pairs; the original passed only the
        # metadata values, so the contig-id lookup below could not work.
        contigs = dict(
            (c, mvf.metadata['contigs'][c]) for c in args.contigs)
    else:
        contigs = dict(mvf.metadata['contigs'])
    sample_cols = mvf.get_sample_indices(args.samples or None)
    labels = mvf.get_sample_labels(sample_cols)
    # --buffer is documented in Mbp; convert it to a byte/char count
    # instead of using the raw number (10 -> 10 bytes) as before.
    buffer_size = max(args.buffer, 1) * 1000000
    current_contig = None
    tmp_files = {}
    try:
        # Create the temporary files inside --tmpdir so that the path
        # used for creation and the path used for removal agree.
        for label in labels:
            if label not in tmp_files:
                tmp_files[label] = open(
                    os.path.join(args.tmpdir, label + '.tmp'),
                    'w+', buffer_size)
        for contig, _, allelesets in mvf.iterentries(
                contigs=args.contigs, subset=sample_cols,
                quiet=args.quiet, decode=True):
            # NOTE(review): entries are requested with decode=True and
            # then decoded again here as a whole; kept as in the
            # original - confirm against MultiVariantFile.decode.
            alleles = mvf.decode(allelesets)
            if current_contig != contig:
                # New contig: start a new FASTA record for each sample.
                current_contig = contig
                for col, label in zip(sample_cols, labels):
                    if args.labeltype == 'long':
                        tmp_files[label].write(
                            '\n>{} contig={}  length={}\n{}'.format(
                                label,
                                contigs[current_contig]['label'],
                                contigs[current_contig]['length'],
                                alleles[col]))
                    elif args.labeltype == 'short':
                        tmp_files[label].write(
                            '\n>{}_{}\n{}'.format(
                                label, contigs[current_contig]['label'],
                                alleles[col]))
            else:
                for col, label in zip(sample_cols, labels):
                    tmp_files[label].write(alleles[col])
        with open(args.out, 'w') as outfile:
            for filehandler in tmp_files.values():
                filehandler.seek(0, 0)
                buff = filehandler.read(buffer_size)
                while buff:
                    outfile.write(buff)
                    buff = filehandler.read(buffer_size)
    finally:
        # Close and delete the temporary files even if the run failed.
        for filehandler in tmp_files.values():
            filehandler.close()
            os.remove(filehandler.name)
    return ''