예제 #1
0
def phytozome(args):
    """
    %prog phytozome species

    Retrieve genomes and annotations from phytozome FTP. Available species
    listed below. Use comma to give a list of species to download. For example:

    $ %prog phytozome Athaliana,Vvinifera,Osativa,Sbicolor,Slycopersicum
    """
    # NOTE: the docstring above is runtime data -- it is handed to the option
    # parser as the usage text -- so it must not be reworded casually.
    from jcvi.formats.gff import bed as gff_bed
    from jcvi.formats.fasta import format as fasta_format

    parser = OptionParser(phytozome.__doc__)
    parser.add_option(
        "--version", default="9.0",
        help="Phytozome version [default: %default]")
    parser.add_option(
        "--assembly", default=False, action="store_true",
        help="Download assembly [default: %default]")
    parser.add_option(
        "--format", default=False, action="store_true",
        help="Format to CDS and BED for synteny inference")
    opts, args = parser.parse_args(args)

    # The listing is fetched before the arguments are validated because the
    # help text printed on bad usage includes the available species.
    url_template = "ftp://ftp.jgi-psf.org/pub/compgen/phytozome/v{0}/"
    url = url_template.format(opts.version)
    # Listing entries that contain a "." are not considered species.
    valid_species = [entry for entry in ls_ftp(url) if "." not in entry]

    usage_text = "\n".join((phytozome.__doc__, tile(valid_species)))
    parser.set_usage(usage_text)

    if len(args) != 1:
        # presumably print_help() returns None, making the exit status truthy
        # (i.e. a failure) -- confirm against the OptionParser in use
        sys.exit(not parser.print_help())

    (requested,) = args
    if requested == "all":
        requested = ",".join(valid_species)

    # Species whose BED names come from the "ID" attribute; currently none,
    # so every species is keyed on "Name".
    id_keyed = set()
    # We have to watch out when the gene names and mRNA names mismatch, in
    # which case we just extract the mRNA names
    mrna_typed = {
        "Cclementina", "Creinhardtii", "Csinensis", "Fvesca",
        "Lusitatissimum", "Mesculenta", "Mguttatus", "Ppersica",
        "Pvirgatum", "Rcommunis", "Sitalica", "Tcacao",
        "Thalophila", "Vcarteri", "Vvinifera", "Zmays",
    }

    for name in requested.split(","):
        gff_path, fasta_path = download_species_phytozome(
            name, valid_species, url, assembly=opts.assembly)
        if opts.format:
            attr_key = "ID" if name in id_keyed else "Name"
            feature = "mRNA" if name in mrna_typed else "gene"
            bed_out = name + ".bed"
            cds_out = name + ".cds"
            gff_bed([
                gff_path,
                "--type={}".format(feature),
                "--key={}".format(attr_key),
                "-o",
                bed_out,
            ])
            fasta_format([fasta_path, cds_out, r"--sep=|"])
예제 #2
0
파일: fetch.py 프로젝트: tanghaibao/jcvi
def phytozome(args):
    """
    %prog phytozome species

    Retrieve genomes and annotations from phytozome FTP. Available species
    listed below. Use comma to give a list of species to download. For example:

    $ %prog phytozome Athaliana,Vvinifera,Osativa,Sbicolor,Slycopersicum
    """
    # The docstring doubles as the command-line usage message (see the
    # OptionParser / set_usage calls below); treat it as program text.
    from jcvi.formats.gff import bed as gff_bed
    from jcvi.formats.fasta import format as fasta_format

    optparser = OptionParser(phytozome.__doc__)
    optparser.add_option("--version",
                         default="9.0",
                         help="Phytozome version [default: %default]")
    optparser.add_option("--assembly",
                         default=False,
                         action="store_true",
                         help="Download assembly [default: %default]")
    optparser.add_option("--format",
                         default=False,
                         action="store_true",
                         help="Format to CDS and BED for synteny inference")
    opts, args = optparser.parse_args(args)

    url = "ftp://ftp.jgi-psf.org/pub/compgen/phytozome/v{0}/".format(
        opts.version)
    # Keep only listing entries without a dot in their name.
    valid_species = []
    for item in ls_ftp(url):
        if "." not in item:
            valid_species.append(item)

    # Extend the usage message with the species found for this version.
    optparser.set_usage("\n".join((phytozome.__doc__, tile(valid_species))))

    if len(args) != 1:
        sys.exit(not optparser.print_help())

    selection = args[0]
    # "all" is shorthand for every species found in the listing.
    targets = valid_species if selection == "all" else selection.split(",")

    # No species currently needs the "ID" attribute as its key.
    species_keyed_by_id = set()
    # We have to watch out when the gene names and mRNA names mismatch, in
    # which case we just extract the mRNA names
    species_typed_by_mrna = {
        "Cclementina", "Creinhardtii", "Csinensis", "Fvesca",
        "Lusitatissimum", "Mesculenta", "Mguttatus", "Ppersica",
        "Pvirgatum", "Rcommunis", "Sitalica", "Tcacao",
        "Thalophila", "Vcarteri", "Vvinifera", "Zmays",
    }

    for sp in targets:
        gff_file, fasta_file = download_species_phytozome(
            sp, valid_species, url, assembly=opts.assembly)
        if not opts.format:
            # Download-only mode: nothing else to do for this species.
            continue

        if sp in species_keyed_by_id:
            key_attr = "ID"
        else:
            key_attr = "Name"
        if sp in species_typed_by_mrna:
            feature_type = "mRNA"
        else:
            feature_type = "gene"

        bed_args = [gff_file,
                    "--type={}".format(feature_type),
                    "--key={}".format(key_attr),
                    "-o", sp + ".bed"]
        gff_bed(bed_args)
        fasta_format([fasta_file, sp + ".cds", r"--sep=|"])
예제 #3
0
파일: fetch.py 프로젝트: lizhencmb/jcvi
def phytozome(args):
    """
    %prog phytozome species

    Retrieve genomes and annotations from phytozome FTP. Available species
    listed below. Use comma to give a list of species to download. For example:

    $ %prog phytozome Athaliana,Vvinifera,Osativa,Sbicolor,Slycopersicum
    """
    # The docstring is reused verbatim as the usage text below, so it is part
    # of the program's output rather than plain documentation.
    from jcvi.formats.gff import bed as gff_bed
    from jcvi.formats.fasta import format as fasta_format

    parser = OptionParser(phytozome.__doc__)
    # (flag, extra keyword arguments) for each supported option, in the order
    # they are registered.
    option_specs = (
        ("--version", dict(default="9.0",
                           help="Phytozome version [default: %default]")),
        ("--assembly", dict(default=False, action="store_true",
                            help="Download assembly [default: %default]")),
        ("--format", dict(default=False, action="store_true",
                          help="Format to CDS and BED for synteny inference")),
    )
    for flag, settings in option_specs:
        parser.add_option(flag, **settings)
    opts, args = parser.parse_args(args)

    base = "ftp://ftp.jgi-psf.org/pub/compgen/phytozome/v{0}/"
    url = base.format(opts.version)
    # Only listing entries with no "." in the name count as species.
    valid_species = [name for name in ls_ftp(url) if "." not in name]

    # Show the available species as part of the help output.
    full_usage = "\n".join((phytozome.__doc__, tile(valid_species)))
    parser.set_usage(full_usage)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    (species_arg,) = args
    for sp in species_arg.split(","):
        gff_path, fasta_path = download_species_phytozome(
            sp, valid_species, url, assembly=opts.assembly)
        if opts.format:
            bed_path = sp + ".bed"
            cds_path = sp + ".cds"
            gff_bed([gff_path, "--phytozome", "-o", bed_path])
            fasta_format([fasta_path, cds_path, r"--sep=|"])
예제 #4
0
def format_bed_and_cds(species, gff, cdsfa):
    """Run gff.bed() and fasta.format() to generate BED and CDS files.

    The GFF file is handed to ``jcvi.formats.gff.bed`` with
    ``<species>.bed`` as the ``-o`` target, and the FASTA file is handed to
    ``jcvi.formats.fasta.format`` together with ``<species>.cds``. This
    prepares the input files for the MCscan synteny workflow.

    https://github.com/tanghaibao/jcvi/wiki/MCscan-(Python-version)

    Args:
        species (str): Name of the species; also used as the basename of the
            ``.bed`` and ``.cds`` files
        gff (str): Path to the GFF file
        cdsfa (str): Path to the FASTA file (presumably the CDS sequences,
            going by the parameter name)
    """
    from jcvi.formats.gff import bed as gff_bed
    from jcvi.formats.fasta import format as fasta_format

    # Species whose BED names should come from the "ID" attribute instead of
    # "Name". Empty for now, so "Name" is always used.
    use_IDs = set()
    # We have to watch out when the gene names and mRNA names mismatch, in
    # which case we just extract the mRNA names
    use_mRNAs = {
        "Cclementina",
        "Creinhardtii",
        "Csinensis",
        "Fvesca",
        "Lusitatissimum",
        "Mesculenta",
        "Mguttatus",
        "Ppersica",
        "Pvirgatum",
        "Rcommunis",
        "Sitalica",
        "Tcacao",
        "Thalophila",
        "Vcarteri",
        "Vvinifera",
        "Zmays",
    }
    key = "ID" if species in use_IDs else "Name"
    ttype = "mRNA" if species in use_mRNAs else "gene"
    bedfile = species + ".bed"
    cdsfile = species + ".cds"
    gff_bed([
        gff, "--type={}".format(ttype), "--key={}".format(key), "-o", bedfile
    ])
    fasta_format([cdsfa, cdsfile, r"--sep=|"])