Пример #1
0
        default='data/',
        help='optional, directory to write TreeTime output files')
    parser.add_argument('--ft2bin',
                        default='fasttree2',
                        help='optional, path to fasttree2 binary executable')
    parser.add_argument('--ttbin',
                        default='treetime',
                        help='optional, path to treetime binary executable')
    return parser.parse_args()


if __name__ == '__main__':
    # Command-line entry point: retrieve genomes, reconstruct a tree with the
    # FastTree2 binary, time-scale it with the TreeTime binary, and parse the
    # resulting NEXUS file.
    args = parse_args()
    cb = Callback()

    cb.callback("Retrieving genomes")
    fasta = retrieve_genomes(args.db, ref_file=args.ref, misstol=args.misstol)

    cb.callback("Reconstructing tree with {}".format(args.ft2bin))
    nwk = fasttree(fasta, binpath=args.ft2bin)

    # The message has to be formatted BEFORE it is handed to the callback.
    # Calling .format() on the callback's return value would log the literal
    # "{}" placeholder and fail unless that return value is a string.
    cb.callback("Reconstructing time-scaled tree with {}".format(args.ttbin))
    nexus_file = treetime(nwk,
                          fasta,
                          outdir=args.outdir,
                          binpath=args.ttbin,
                          clock=args.clock)

    # an empty message is passed to the callback before parsing the output
    cb.callback("")
    parse_nexus(nexus_file, fasta, date_tol=args.datetol)
Пример #2
0
    parser.add_argument('--ttbin', default='treetime',
                        help='optional, path to treetime binary executable')
    parser.add_argument('--lineages', type=str,
                        default=os.path.join(covizu.__path__[0], "data/pango-designation/lineages.csv"),
                        help="optional, path to CSV file containing Pango lineage designations.")

    parser.add_argument('--outfile', default='data/timetree.nwk',
                        help='output, path to write Newick tree string')
    return parser.parse_args()


if __name__ == '__main__':
    # Command-line entry point: load the per-lineage JSON, then read the Pango
    # lineage designation CSV into a {taxon: lineage} dictionary.
    args = parse_args()
    cb = Callback()

    cb.callback("Retrieving genomes")
    with open(args.json) as handle:
        by_lineage = json.load(handle)

    cb.callback("Parsing Pango lineage designations")
    lineages = {}
    # context manager guarantees the CSV is closed, including on the error exit
    with open(args.lineages) as handle:
        # default of '' makes an empty file take the same error path as a
        # wrong header instead of raising a bare StopIteration
        header = next(handle, '')
        if header != 'taxon,lineage\n':
            cb.callback("Error: {} does not contain expected header row 'taxon,lineage'".format(args.lineages))
            # non-zero status so that calling shells/pipelines see the failure
            sys.exit(1)
        for line in handle:
            taxon, lineage = line.strip().split(',')
            lineages[taxon] = lineage

    cb.callback("Identifying lineage representative genomes")
    aligned = gisaid_utils.extract_features(batcher, ref_file=args.ref, binpath=args.mmbin,
                                            nthread=args.mmthreads, minlen=args.minlen)
    filtered = gisaid_utils.filter_problematic(aligned, vcf_file=args.vcf, cutoff=args.poisson_cutoff,
                                               callback=callback)
    return gisaid_utils.sort_by_lineage(filtered, callback=callback)


if __name__ == "__main__":
    # Script entry point: verify that mpirun can be launched, collect the
    # per-lineage data from process_local(), save it as JSON, then build the
    # time-scaled tree and write it out in Newick format.
    args = parse_args()
    cb = Callback()

    # check that user has loaded openmpi module
    try:
        subprocess.check_call(['mpirun', '-np', '2', 'ls'], stdout=subprocess.DEVNULL)
    except FileNotFoundError:
        cb.callback("mpirun not loaded - run `module load openmpi/gnu`", level='ERROR')
        # sys.exit() without an argument reports success (status 0); use a
        # non-zero status so that shells and schedulers see the failure
        sys.exit(1)

    by_lineage = process_local(args, cb.callback)
    with open(args.bylineage, 'w') as handle:
        # export to file to process large lineages with MPI
        json.dump(by_lineage, handle)

    # reconstruct time-scaled tree
    timetree, residuals = build_timetree(by_lineage, args, cb.callback)
    # ISO timestamp with the fractional seconds dropped, used in the file name
    timestamp = datetime.now().isoformat().split('.')[0]
    nwk_file = os.path.join(args.outdir, 'timetree.{}.nwk'.format(timestamp))
    with open(nwk_file, 'w') as handle:
        Phylo.write(timetree, file=handle, format='newick')

    # generate beadplots and serialize to file
Пример #4
0
    aligned = gisaid_utils.extract_features(batcher, ref_file=args.ref, binpath=args.mmbin,
                                            nthread=args.mmthreads, minlen=args.minlen)
    filtered = gisaid_utils.filter_problematic(aligned, vcf_file=args.vcf, cutoff=args.poisson_cutoff,
                                               callback=callback)
    return gisaid_utils.sort_by_lineage(filtered, callback=callback)


if __name__ == "__main__":
    # Script entry point (opening section): verify that mpirun can be launched
    # and that the data files provided by the git submodules are present,
    # initialising the submodules when they are not.
    args = parse_args()
    cb = Callback()

    # check that user has loaded openmpi module
    try:
        subprocess.check_call(['mpirun', '-np', '2', 'ls'], stdout=subprocess.DEVNULL)
    except FileNotFoundError:
        cb.callback("mpirun not loaded - run `module load openmpi/gnu`", level='ERROR')
        # sys.exit() without an argument reports success (status 0); use a
        # non-zero status so that shells and schedulers see the failure
        sys.exit(1)

    # check that the user has included submodules
    if (not os.path.exists(os.path.join(covizu.__path__[0], "data/pango-designation/lineages.csv")) or
            not os.path.exists(os.path.join(covizu.__path__[0], "data/ProblematicSites_SARS-CoV2/problematic_sites_sarsCov2.vcf"))):
        try:
            subprocess.check_call("git submodule init; git submodule update", shell=True)
        except (subprocess.SubprocessError, OSError):
            # Only failures of the git invocation are handled here; a bare
            # `except:` would also intercept KeyboardInterrupt and SystemExit.
            cb.callback("Error adding the required submodules")
            sys.exit(1)

    # update submodules
    try:
        subprocess.check_call("git submodule foreach git pull origin master", shell=True)
    except:
Пример #5
0
            if my_rank == 0:
                trees = [phy for batch in result
                         for phy in batch]  # flatten nested lists
                Phylo.write(trees, file=outfile, format='newick')

    elif args.mode == 'flat':
        # load list of lineages from text file
        minor_lineages = []
        with open(args.lineage) as handle:
            for line in handle:
                minor_lineages.append(line.strip())

        for li, lineage in enumerate(minor_lineages):
            if li % nprocs != my_rank:
                continue
            cb.callback("starting {}".format(lineage))
            union, labels, indexed = unpack_recoded(recoded,
                                                    lineage,
                                                    callback=cb.callback)

            lineage_name = lineage.replace('/', '_')  # issue #297
            outfile = os.path.join(args.outdir, '{}.nwk'.format(lineage_name))
            if len(indexed) == 1:
                # lineage only has one variant, no meaningful tree
                with open(outfile, 'w') as handle:
                    handle.write('({}:0);\n'.format(labels['0'][0]))
            else:
                trees = [
                    bootstrap(union,
                              indexed,
                              args.binpath,
Пример #6
0
        args.url = os.environ["GISAID_URL"]
    if args.user is None and "GISAID_USER" in os.environ:
        args.user = os.environ["GISAID_USER"]
        # otherwise download_feed() will prompt for username
    if args.password is None and "GISAID_PSWD" in os.environ:
        args.password = os.environ["GISAID_PSWD"]
        # otherwise download_feed() will prompt for password

    return args


if __name__ == '__main__':
    # Script entry point: obtain the GISAID feed file (downloading it when no
    # input file was given) and pass it through loading, batching and feature
    # extraction.
    args = parse_args()
    cb = Callback()

    cb.callback("Processing GISAID feed data")

    # download xz file if not specified by user
    if args.infile is None:
        args.infile = download_feed(args.url, args.user, args.password)

    # each stage below takes the result of the previous stage as its input
    loader = load_gisaid(args.infile,
                         minlen=args.minlen,
                         mindate=args.mindate,
                         debug=args.debug)
    batcher = batch_fasta(loader, size=args.batchsize)
    aligned = extract_features(batcher,
                               ref_file=args.ref,
                               binpath=args.binpath,
                               nthread=args.mmthreads,
                               minlen=args.minlen)
Пример #7
0
    try:
        from mpi4py import MPI
    except ModuleNotFoundError:
        print("Script requires mpi4py - https://pypi.org/project/mpi4py/")
        sys.exit()

    comm = MPI.COMM_WORLD
    my_rank = comm.Get_rank()
    nprocs = comm.Get_size()

    # command-line execution
    args = parse_args()
    cb = Callback(t0=args.timestamp, my_rank=my_rank, nprocs=nprocs)

    # import lineage data from file
    cb.callback('loading JSON')
    with open(args.json) as handle:
        by_lineage = json.load(handle)

    records = by_lineage.get(args.lineage, None)
    if records is None:
        cb.callback("ERROR: JSON did not contain lineage {}".format(
            args.lineage))
        sys.exit()

    # generate distance matrices from bootstrap samples [[ MPI ]]
    union, labels, indexed = recode_features(records,
                                             callback=cb.callback,
                                             limit=args.max_variants)

    # export map of sequence labels to tip indices
Пример #8
0
                                               vcf_file=args.vcf,
                                               cutoff=args.poisson_cutoff,
                                               callback=callback)
    return gisaid_utils.sort_by_lineage(filtered, callback=callback)


if __name__ == "__main__":
    # Script entry point (opening section): parse arguments, create the
    # progress callback and verify that mpirun can be launched.
    args = parse_args()
    cb = Callback()

    # check that user has loaded openmpi module
    try:
        subprocess.check_call(['mpirun', '-np', '2', 'ls'],
                              stdout=subprocess.DEVNULL)
    except FileNotFoundError:
        cb.callback("mpirun not loaded - run `module load openmpi/gnu`",
                    level='ERROR')
        # sys.exit() without an argument reports success (status 0); use a
        # non-zero status so that shells and schedulers see the failure
        sys.exit(1)

    # check that the user has included submodules
    if (not os.path.exists(
            os.path.join(covizu.__path__[0],
                         "data/pango-designation/lineages.csv")
    ) or not os.path.exists(
            os.path.join(
                covizu.__path__[0],
                "data/ProblematicSites_SARS-CoV2/problematic_sites_sarsCov2.vcf"
            ))):
        try:
            subprocess.check_call("git submodule init; git submodule update",
                                  shell=True)
        except:
Пример #9
0
                        "with `--threads 1`.")
    parser.add_argument(
        "--cutoff",
        type=float,
        default=0.5,
        help="Bootstrap cutoff for consensus tree (default 0.5). "
        "Only used if --cons is specified.")
    return parser.parse_args()


if __name__ == "__main__":
    # command-line execution
    # Loads lineage classifications and feature data, groups the features by
    # lineage, and builds bootstrap trees for each lineage in turn.
    args = parse_args()
    cb = Callback()

    cb.callback('loading lineage classifications from database')
    lineages = db_utils.dump_lineages(args.db)

    cb.callback('loading JSON')
    features = import_json(args.json, vcf_file=args.vcf, callback=cb.callback)

    # mapping of lineage -> its features; processed one lineage at a time
    by_lineage = split_by_lineage(features, lineages)
    for lineage, lfeatures in by_lineage.items():
        cb.callback('start {}, {} entries'.format(lineage, len(lfeatures)))

        # calculate symmetric difference matrix and run NJ on bootstrap samples
        # (outliers are filtered out before the trees are built)
        filtered = seq_utils.filter_outliers(lfeatures)
        trees, labels = build_trees(filtered,
                                    nboot=args.nboot,
                                    threads=args.threads,
                                    callback=cb.callback)
                                               vcf_file=args.vcf,
                                               cutoff=args.poisson_cutoff,
                                               callback=callback)
    return gisaid_utils.sort_by_lineage(filtered, callback=callback)


if __name__ == "__main__":
    # Script entry point: verify that mpirun can be launched, obtain the
    # GISAID feed (downloading it when no input file was given), save the
    # per-lineage data as JSON, then build the time-scaled tree.
    args = parse_args()
    cb = Callback()

    # check that user has loaded openmpi module
    try:
        subprocess.check_call(['mpirun', '-np', '2', 'ls'],
                              stdout=subprocess.DEVNULL)
    except FileNotFoundError:
        cb.callback("mpirun not loaded - run `module load openmpi/gnu`",
                    level='ERROR')
        # sys.exit() without an argument reports success (status 0); use a
        # non-zero status so that shells and schedulers see the failure
        sys.exit(1)

    # download xz file if not specified by user
    if args.infile is None:
        cb.callback("No input specified, downloading data from GISAID feed...")
        args.infile = gisaid_utils.download_feed(args.url, args.user,
                                                 args.password)

    by_lineage = process_feed(args, cb.callback)
    with open(args.bylineage, 'w') as handle:
        # export to file to process large lineages with MPI
        json.dump(by_lineage, handle)

    timetree, residuals = build_timetree(by_lineage, args, cb.callback)
Пример #11
0
        "Only used if --cons is specified.")

    parser.add_argument("outfile",
                        type=argparse.FileType('w'),
                        default='data/clusters.json',
                        help="output, dest for JSON beadplot file")

    return parser.parse_args()


if __name__ == "__main__":
    # Script entry point (opening section): retrieve the lineage genomes and
    # reconstruct a tree from them before the time-scaling step.
    args = parse_args()
    cb = Callback()

    # Generate time-scaled tree of Pangolin lineages
    cb.callback("Retrieving lineage genomes")
    fasta = treetime.retrieve_genomes(args.db,
                                      nthread=args.mmthreads,
                                      ref_file=args.ref,
                                      misstol=args.misstol,
                                      callback=cb.callback)

    # tree reconstruction uses the binary given by --ft2bin
    cb.callback("Reconstructing tree with {}".format(args.ft2bin))
    nwk = treetime.fasttree(fasta, binpath=args.ft2bin)

    # time-scaling uses the binary given by --ttbin
    cb.callback("Reconstructing time-scaled tree with {}".format(args.ttbin))
    nexus_file = treetime.treetime(nwk,
                                   fasta,
                                   outdir=args.outdir,
                                   binpath=args.ttbin,
                                   clock=args.clock,