Пример #1
0
def main():
    """Prune a Newick tree to the leaves named in a sample file and print it.

    Command-line arguments:
        treeFile: Newick file to parse.
        sampleFile: file with one sample ID per line.

    Tree leaf IDs are matched against the sample IDs through the lookup built
    by virusNames.makeIdLookup, the tree is intersected with that lookup, and
    the result is passed to newick.printTree.  Warnings are logged for lookup
    keys that do not map to exactly one sample, and for samples that did not
    end up in the pruned tree (with up to 10 randomly chosen examples).
    """
    parser = argparse.ArgumentParser(description="""
Read in tree, read in samples from sampleFile, attempt to match tree's leaf IDs with samples,
prune tree to only branches with leaves found in sampleFile, output pruned tree with sample IDs.
"""
    )
    parser.add_argument('treeFile', help='Newick file with IDs similar to Nextstrain')
    parser.add_argument('sampleFile', help='File with one sample ID per line')
    args = parser.parse_args()
    # Very large, deeply nested trees can exceed the default recursion limit of 1000.
    sys.setrecursionlimit(100000)
    # logging.basicConfig(level=logging.DEBUG, filename='debug.log')
    tree = newick.parseFile(args.treeFile)
    samples = utils.listFromFile(args.sampleFile)
    idLookup = virusNames.makeIdLookup(samples)
    for key in idLookup:
        values = idLookup[key]
        if len(values) != 1:
            logging.warning('Duplicate name/component in %s: %s -> %s',
                            args.sampleFile, key, ", ".join(values))
    # treeIntersectIds receives this set as an argument; it is read back below
    # as the collection of sample IDs that were matched to tree leaves.
    foundSampleSet = set()
    tree = newick.treeIntersectIds(tree, idLookup, foundSampleSet, virusNames.lookupSeqName)
    newick.printTree(tree)
    if len(foundSampleSet) < len(samples):
        logging.warning("%s has %d samples but pruned tree has %d leaves (%d samples not found)",
                        args.sampleFile, len(samples), len(foundSampleSet),
                        len(samples) - len(foundSampleSet))
        sampleFileNotTree = set(samples) - foundSampleSet
        # The difference can be empty when sampleFile merely contains duplicate
        # lines, and it can hold fewer than 10 IDs.  random.sample() requires a
        # sequence (sets are rejected since Python 3.11) and a sample size no
        # larger than the population, so convert and clamp before sampling.
        if sampleFileNotTree:
            examples = random.sample(sorted(sampleFileNotTree),
                                     min(10, len(sampleFileNotTree)))
            logging.warning("Example samples not found:\n%s", "\n".join(examples))
Пример #2
0
def main():
    """Print the branch of a Newick tree that contains all listed samples.

    Command-line arguments:
        treeFile: Newick file to parse.
        sampleFile: file with one sample ID per line.

    The parsed tree and the sample list are handed to treeBranchWithSamples,
    and whatever it returns is passed to newick.printTree.
    """
    description = """
Read in tree, read samples, find branch of tree that has all of the samples as leaves,
and write out that branch as a new tree.  All samples must exactly match leaf names
and all must be found.
"""
    argParser = argparse.ArgumentParser(description=description)
    argParser.add_argument('treeFile', help='Newick file with IDs similar to Nextstrain')
    argParser.add_argument('sampleFile', help='File with one sample ID per line')
    opts = argParser.parse_args()

    # Raise the recursion limit well above the default of 1000 so that very
    # large, deeply nested trees can be processed.
    sys.setrecursionlimit(100000)
    # To debug, enable: logging.basicConfig(level=logging.DEBUG, filename='debug.log')

    parsedTree = newick.parseFile(opts.treeFile)
    sampleIds = utils.listFromFile(opts.sampleFile)
    newick.printTree(treeBranchWithSamples(parsedTree, sampleIds))
Пример #3
0
def main():
    """Prune a Newick tree to the leaves whose IDs match samples in a VCF file.

    Command-line arguments:
        treeFile: Newick file to parse.
        vcfFile: VCF file whose sample IDs are read with vcf.readSamples.

    Tree leaf IDs are matched against the VCF sample IDs through the lookup
    built by virusNames.makeIdLookup, the tree is intersected with that lookup,
    and the result is passed to newick.printTree.  When fewer samples were
    matched than the VCF contains, warnings are logged (with up to 10 randomly
    chosen unmatched examples) and vcf.pruneToSamples is called to write
    'intersected.vcf' restricted to the matched samples.
    """
    parser = argparse.ArgumentParser(description="""
Read in tree, read in samples from VCF, attempt to match tree's leaf IDs with VCF IDs,
prune tree to only branches with leaves found in VCF, output pruned tree with VCF IDs.
""")
    parser.add_argument('treeFile',
                        help='Newick file with IDs similar to Nextstrain')
    parser.add_argument('vcfFile',
                        help='VCF file with IDs similar to Nextstrain')
    args = parser.parse_args()
    # Very large, deeply nested trees can exceed the default recursion limit of 1000.
    sys.setrecursionlimit(100000)
    # logging.basicConfig(level=logging.DEBUG, filename='intersect.log')
    tree = newick.parseFile(args.treeFile)
    vcfSamples = vcf.readSamples(args.vcfFile)
    idLookup = virusNames.makeIdLookup(vcfSamples)
    # Keys mapping to more than 3 IDs are removed from the lookup without a
    # warning (presumably too ambiguous to be useful -- confirm); keys mapping
    # to 2 or 3 IDs are kept but reported.  Deletion is deferred so the lookup
    # is not modified while it is being iterated.
    badKeys = []
    for key in idLookup:
        values = idLookup[key]
        if len(values) > 3:
            badKeys.append(key)
        elif len(values) != 1:
            logging.warning('Duplicate name/component in VCF: %s -> %s',
                            key, ", ".join(values))
    for key in badKeys:
        del idLookup[key]
    # treeIntersectIds receives this set as an argument; it is read back below
    # as the collection of VCF sample IDs that were matched to tree leaves.
    sampleSet = set()
    tree = newick.treeIntersectIds(tree, idLookup, sampleSet,
                                   virusNames.lookupSeqName)
    newick.printTree(tree)
    if len(sampleSet) < len(vcfSamples):
        logging.warning(
            "VCF has %d samples but pruned tree has %d leaves (%d VCF samples not found)",
            len(vcfSamples), len(sampleSet), len(vcfSamples) - len(sampleSet))
        vcfNotTree = set(vcfSamples) - sampleSet
        # The difference can be empty (duplicate IDs in the VCF) or hold fewer
        # than 10 IDs.  random.sample() requires a sequence (sets are rejected
        # since Python 3.11) and a sample size no larger than the population,
        # so convert and clamp; otherwise the VCF below would never be written.
        if vcfNotTree:
            examples = random.sample(sorted(vcfNotTree),
                                     min(10, len(vcfNotTree)))
            logging.warning("Example VCF samples not found:\n%s", "\n".join(examples))
        vcfOutName = 'intersected.vcf'
        logging.warning("Writing VCF to %s", vcfOutName)
        vcf.pruneToSamples(args.vcfFile, sampleSet, vcfOutName)
Пример #4
0
def main():
    """Annotate a Newick tree with lineage information and print it.

    Command-line arguments:
        treeFile: Newick file to parse.
        lineageFile: two-column tab-separated file mapping sample to lineage.

    Sample names from lineageFile are translated with
    virusNames.maybeLookupSeqName against a lookup built from the tree's leaf
    names, the resulting name -> lineage mapping is applied to the tree by
    lineageColors.addLineagesAsBogusLength, and the tree is passed to
    newick.printTree.  A warning is logged when that call reports a non-zero
    count of samples without a lineage.
    """
    parser = argparse.ArgumentParser(description="""
Read in tree, read in samples and lineages, attempt to match tree's leaf IDs with lineage IDs,
add colors corresponding to lineages.
""")
    parser.add_argument('treeFile',
                        help='Newick file with IDs similar to Nextstrain')
    parser.add_argument(
        'lineageFile',
        help='Two-column tab-sep file mapping sample to lineage')
    args = parser.parse_args()
    # Very large, deeply nested trees can exceed the default recursion limit of 1000.
    sys.setrecursionlimit(100000)
    tree = newick.parseFile(args.treeFile)
    sampleLineages = utils.dictFromFile(args.lineageFile)
    treeNames = newick.leafNames(tree)
    idLookup = virusNames.makeIdLookup(treeNames)
    # Re-key the lineage assignments by the name maybeLookupSeqName returns for
    # each sample; if two samples translate to the same name, the later one wins.
    treeLineages = {virusNames.maybeLookupSeqName(name, idLookup): lin
                    for name, lin in sampleLineages.items()}
    noLinCount = lineageColors.addLineagesAsBogusLength(tree, treeLineages)
    if noLinCount:
        # logging.warn is a deprecated alias; logging.warning is the supported name.
        logging.warning("%d samples had no lineage in %s",
                        noLinCount, args.lineageFile)
    newick.printTree(tree)
Пример #5
0
def main():
    """Assign lineages and colors to the nodes of a Newick tree.

    Command-line arguments:
        treeFile: Newick tree to parse.
        vcfFile: VCF file whose sample IDs are read with vcf.readSamples.
        lineageFile: two-column tab-separated file mapping sample to lineage.

    The VCF sample IDs are turned into a lookup with virusNames.makeIdLookup,
    the lineage file's sample names are translated through
    virusNames.maybeLookupSeqName, and the tree, the lookup and the translated
    lineage mapping are handed to assignColors.
    """
    description = """
Read tree from Newick treeFile.
Read sample IDs that are a concatenation of EPI ID, sample name and approximate date,
for resolving sampleFile IDs and lineageFile IDs, from a VCF file.
Read lineage assignments from lineageFile.
Figure out what lineage and color (if any) are assigned to each leaf, and then work
back towards root assigning color to each named node whose descendants all have same color.
Write out 3 tab-sep columns:
sampleOrNode, lineage, lineageColor.
"""
    argParser = argparse.ArgumentParser(description=description)
    argParser.add_argument('treeFile', help='Newick tree whose leaf labels are sample IDs')
    argParser.add_argument('vcfFile', help='VCF file with genotype columns for the sample samples')
    argParser.add_argument('lineageFile', help='Two-column tab-sep file mapping sample to lineage')
    opts = argParser.parse_args()

    parsedTree = newick.parseFile(opts.treeFile)
    sampleLookup = virusNames.makeIdLookup(vcf.readSamples(opts.vcfFile))
    rawLineages = utils.dictFromFile(opts.lineageFile)
    # Re-key lineage assignments by the name maybeLookupSeqName returns for each
    # sample; later entries overwrite earlier ones that translate to the same name.
    resolvedLineages = {}
    for sampleName, lineage in rawLineages.items():
        resolvedLineages[virusNames.maybeLookupSeqName(sampleName, sampleLookup)] = lineage
    assignColors(parsedTree, sampleLookup, resolvedLineages)