Example #1
def download_species_ensembl(species, valid_species, url):
    assert species in valid_species, "{0} is not in the species list".format(species)

    # We want to download assembly and annotation for given species
    ann_url = urljoin(url, "gtf/{0}".format(species))
    cds_url = urljoin(url, "fasta/{0}/cds".format(species))

    for u in (ann_url, cds_url):
        valid_files = [x for x in ls_ftp(u) if x.endswith(".gz")]
        for f in valid_files:
            f = urljoin(u, f)
            download(f)
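Note: in the jcvi examples on this page, urljoin is not the standard library's urllib.parse.urljoin. Later examples alias os.path.join to the name (Example #11) or pass it several path segments at once (Example #3), so it behaves as a segment joiner. A minimal sketch of such a helper, offered only as an assumption so the snippets can be run standalone:

def urljoin(*parts):
    # Join URL segments the way os.path.join joins paths: strip slashes
    # at each seam and reinsert exactly one between segments.
    return "/".join(str(part).strip("/") for part in parts)

For example, urljoin("ftp://host/sra/", "SRR", "SRR100") yields "ftp://host/sra/SRR/SRR100".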
Example #2
File: fetch.py Project: yangjl/jcvi
def download_species_ensembl(species, valid_species, url):
    assert species in valid_species, \
            "{0} is not in the species list".format(species)

    # We want to download assembly and annotation for given species
    ann_url = urljoin(url, "gtf/{0}".format(species))
    cds_url = urljoin(url, "fasta/{0}/cds".format(species))

    for u in (ann_url, cds_url):
        valid_files = [x for x in ls_ftp(u) if x.endswith(".gz")]
        for f in valid_files:
            f = urljoin(u, f)
            download(f)
Example #3
File: fetch.py Project: yangjl/jcvi
def sra(args):
    """
    %prog sra term

    Given an SRA run ID, fetch the corresponding .sra file
    from the sra-instant FTP
    """
    sra_base_url = "ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/"
    sra_run_id_re = re.compile(r'^([DES]{1}RR)(\d{3})(\d{3,4})$')

    p = OptionParser(sra.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    term, = args

    m = re.search(sra_run_id_re, term)
    if m is None:
        logging.error("Incorrect SRA identifier format " + \
                "[should be like SRR126150, SRR1001901. " + \
                "len(identifier) should be between 9-10 characters]")
        sys.exit()

    prefix, subprefix = m.group(1), "{0}{1}".format(m.group(1), m.group(2))
    download_url = urljoin(sra_base_url, prefix, subprefix, term, "{0}.sra".format(term))

    logging.debug("Downloading file: {0}".format(download_url))
    download(download_url)
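For reference, the run-ID pattern above splits an accession into exactly the pieces that make up the sra-instant FTP path. A standalone check using the same regex and layout:

import re

sra_run_id_re = re.compile(r'^([DES]RR)(\d{3})(\d{3,4})$')
m = sra_run_id_re.match("SRR1001901")
prefix, subprefix = m.group(1), m.group(1) + m.group(2)
# prefix == "SRR", subprefix == "SRR100"
# final URL: .../ByRun/sra/SRR/SRR100/SRR1001901/SRR1001901.sra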
Example #4
    def request(self, urn: str, params: dict):
        """ Send a request to CoinMarketCap

        Parameters
        ----------
        urn : `str`
            the endpoint, e.g. "cryptocurrency/info"
        params : `dict`
            the parameters for the request

        Raises
        ------
        requests.exceptions.HTTPError
            If status code is not 200
        """
        url = Request("GET", urljoin(self._url, urn),
                      params=params).prepare().url
        # NOTE: race condition, but it should be harmless
        if self._session.cache.has_url(url):
            response = self._request_cache(url)
        else:
            response = self._request_throttle(url)

        res = loads(response.text)
        if response.status_code == 200:
            res["cached"] = response.from_cache
            return res
        else:
            # raise_for_status() raises HTTPError itself on non-2xx responses;
            # the original `raise response.raise_for_status()` would re-raise
            # its None return value.
            response.raise_for_status()
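_request_cache and _request_throttle are not shown in this example; presumably the first replays a response already in the requests-cache backend while the second spaces out live calls. A rough sketch of a throttled branch, assuming the class also owns the cached session from the example and a fixed minimum interval (all names and the rate limit here are illustrative, not from the source):

import time

class _Throttled:
    _min_interval = 2.1   # assumed spacing between live API calls, in seconds
    _last_call = 0.0

    def _request_throttle(self, url):
        # Sleep just long enough to keep live requests at least
        # _min_interval apart, then issue the GET on the cached session.
        wait = self._min_interval - (time.monotonic() - self._last_call)
        if wait > 0:
            time.sleep(wait)
        self._last_call = time.monotonic()
        return self._session.get(url)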
Example #5
File: fetch.py Project: biologyguy/jcvi
def sra(args):
    """
    %prog sra term

    Given an SRA run ID, fetch the corresponding .sra file
    from the sra-instant FTP
    """
    sra_base_url = "ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/"
    sra_run_id_re = re.compile(r'^([DES]{1}RR)(\d{3})(\d{3,4})$')

    p = OptionParser(sra.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    term, = args

    m = re.search(sra_run_id_re, term)
    if m is None:
        logging.error("Incorrect SRA identifier format " + \
                "[should be like SRR126150, SRR1001901. " + \
                "len(identifier) should be between 9-10 characters]")
        sys.exit()

    prefix, subprefix = m.group(1), "{0}{1}".format(m.group(1), m.group(2))
    download_url = urljoin(sra_base_url, prefix, subprefix, term,
                           "{0}.sra".format(term))

    logging.debug("Downloading file: {0}".format(download_url))
    download(download_url)
Example #6
def get_features(refseq, start, end, strand, featuretype, completely_within=True, level=1):
    """
    Return the features within a given set of chromosome coordinates
    """
    url = urljoin(THALEMINE_BASE_URL, "jbrowse", TAXID, "features", refseq)
    data = tools.do_request(url, None, start=start, end=end, type=featuretype)

    elems_to_delete = []
    for x, elem0 in enumerate(data['features']):
        # remove feature if not completely_within specified chromosome coordinates
        if completely_within and (elem0['start'] < start or elem0['end'] > end):
            elems_to_delete.append(x)
            continue
        # remove all subfeatures below 0th-level object
        if level == 0:
            data['features'][x]['subfeatures'] = []
        # remove all subfeatures below 1st-level object
        elif level == 1:
            for y, elem1 in enumerate(elem0['subfeatures']):
                data['features'][x]['subfeatures'][y]['subfeatures'] = []

    for i in sorted(elems_to_delete, reverse=True):
        del data['features'][i]

    return data
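A hypothetical call, assuming THALEMINE_BASE_URL and TAXID point at a ThaleMine JBrowse-style REST API as in the example:

data = get_features("Chr1", 5000, 9000, "+", "gene", completely_within=True, level=1)
for feature in data["features"]:
    # only genes fully inside Chr1:5000-9000 survive the filtering above
    print(feature["start"], feature["end"], len(feature["subfeatures"]))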
Example #7
def get_global_stats(featuretype):
    """
    Return global stats for features of specific type
    """
    url = urljoin(THALEMINE_BASE_URL, "jbrowse", TAXID, "stats", "global")
    global_stats = tools.do_request(url, None, type=featuretype)

    return global_stats
Example #8
def get_region_feature_densities(refseq, start, end, featuretype):
    """
    Return binned density stats for features within a given set of chromosome coordinates
    """
    url = urljoin(THALEMINE_BASE_URL, "jbrowse", TAXID, "stats", "regionFeatureDensities", refseq)
    region_feature_densities = tools.do_request(url, None, start=start, end=end, type=featuretype)

    return region_feature_densities
Example #9
def download_species_phytozome9(species, valid_species, base_url, assembly=False):
    assert species in valid_species, "{} is not in the species list".format(species)

    # We want to download assembly and annotation for given species
    surl = urljoin(base_url, species)
    contents = [x for x in ls_ftp(surl) if x.endswith("_readme.txt")]
    magic = contents[0].split("_")[1]  # Get the magic number
    logging.debug("Found magic number for {0}: {1}".format(species, magic))

    pf = "{0}_{1}".format(species, magic)
    asm_url = urljoin(surl, "assembly/{0}.fa.gz".format(pf))
    ann_url = urljoin(surl, "annotation/{0}_gene.gff3.gz".format(pf))
    cds_url = urljoin(surl, "annotation/{0}_cds.fa.gz".format(pf))
    res = {}
    if assembly:
        res["asm"] = download(asm_url)
    res["gff"] = download(ann_url)
    res["cds"] = download(cds_url)
    return res
Example #10
    def do(self):
        "run it, get a new url"
        scheme, netloc, path, params, query, fragment = Split(self.url).do()

        if isinstance(self.query, dict):
            query = query + "&" + urllib.urlencode(self.query) if query else urllib.urlencode(self.query)

        path = urljoin(path, self.path).replace('\\', '/') if self.path else path
        
        return Splice(scheme=scheme, netloc=netloc, path=path, params=params, query=query, fragment=fragment).geturl
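The same rebuild can be done with nothing but the standard library. A Python 3 sketch of the equivalent logic, with Split/Splice replaced by urllib.parse (the function name here is illustrative):

import posixpath
from urllib.parse import urlsplit, urlunsplit, urlencode

def extend_url(url, extra_path=None, extra_query=None):
    # Split the URL, graft on a path segment and/or extra query
    # parameters, then reassemble it.
    scheme, netloc, path, query, fragment = urlsplit(url)
    if extra_path:
        path = posixpath.join(path, extra_path)
    if extra_query:
        encoded = urlencode(extra_query)
        query = "{}&{}".format(query, encoded) if query else encoded
    return urlunsplit((scheme, netloc, path, query, fragment))

For example, extend_url("https://host/v1?a=1", "users", {"page": 2}) returns "https://host/v1/users?a=1&page=2".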
Example #11
def download_species_phytozome(species, valid_species, url, assembly=False):
    from os.path import join as urljoin

    assert species in valid_species, \
            "{0} is not in the species list".format(species)

    # We want to download assembly and annotation for given species
    surl = urljoin(url, species)
    contents = [x for x in ls_ftp(surl) if x.endswith("_readme.txt")]
    magic = contents[0].split("_")[1]  # Get the magic number
    logging.debug("Found magic number for {0}: {1}".format(species, magic))

    pf = "{0}_{1}".format(species, magic)
    asm_url = urljoin(surl, "assembly/{0}.fa.gz".format(pf))
    ann_url = urljoin(surl, "annotation/{0}_gene.gff3.gz".format(pf))
    cds_url = urljoin(surl, "annotation/{0}_cds.fa.gz".format(pf))
    if assembly:
        download(asm_url)
    for u in (ann_url, cds_url):
        download(u)
Example #12
File: fetch.py Project: yangjl/jcvi
def download_species_phytozome(species, valid_species, url, assembly=False):
    from os.path import join as urljoin

    assert species in valid_species, \
            "{0} is not in the species list".format(species)

    # We want to download assembly and annotation for given species
    surl = urljoin(url, species)
    contents = [x for x in ls_ftp(surl) if x.endswith("_readme.txt")]
    magic = contents[0].split("_")[1]  # Get the magic number
    logging.debug("Found magic number for {0}: {1}".format(species, magic))

    pf = "{0}_{1}".format(species, magic)
    asm_url = urljoin(surl, "assembly/{0}.fa.gz".format(pf))
    ann_url = urljoin(surl, "annotation/{0}_gene.gff3.gz".format(pf))
    cds_url = urljoin(surl, "annotation/{0}_cds.fa.gz".format(pf))
    if assembly:
        download(asm_url)
    for u in (ann_url, cds_url):
        download(u)
Example #13
File: fetch.py Project: xuanblo/jcvi
def download_srr_term(term):
    sra_base_url = "ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/"
    sra_run_id_re = re.compile(r'^([DES]{1}RR)(\d{3})(\d{3,4})$')

    m = re.search(sra_run_id_re, term)
    if m is None:
        logging.error("Incorrect SRA identifier format " + \
                "[should be like SRR126150, SRR1001901. " + \
                "len(identifier) should be between 9-10 characters]")
        sys.exit()

    prefix, subprefix = m.group(1), "{0}{1}".format(m.group(1), m.group(2))
    download_url = urljoin(sra_base_url, prefix, subprefix, term, "{0}.sra".format(term))

    logging.debug("Downloading file: {0}".format(download_url))
    return download(download_url)
Example #14
def download_srr_term(term):
    sra_base_url = "ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByRun/sra/"
    sra_run_id_re = re.compile(r'^([DES]{1}RR)(\d{3})(\d{3,4})$')

    m = re.search(sra_run_id_re, term)
    if m is None:
        logging.error("Incorrect SRA identifier format " + \
                "[should be like SRR126150, SRR1001901. " + \
                "len(identifier) should be between 9-10 characters]")
        sys.exit()

    prefix, subprefix = m.group(1), "{0}{1}".format(m.group(1), m.group(2))
    download_url = urljoin(sra_base_url, prefix, subprefix, term, "{0}.sra".format(term))

    logging.debug("Downloading file: {0}".format(download_url))
    return download(download_url)
Example #15
def rule(services, settings):
    if not settings.ONE_DOMAIN_MODE:
        return
    API_URL = urljoin(settings.API_URL, "api")
    STORE_AVAILABLE = services.get("store")
    ADMIN_AVAILABLE = services.get("admin")
    BACKEND_AVAILABLE = services.get("backend")
    # replace defaults
    if STORE_AVAILABLE:
        with modify_key(services, "store", "environment") as environment:
            environment["BITCART_STORE_API_URL"] = API_URL
        if ADMIN_AVAILABLE:
            with modify_key(services, "admin", "environment") as environment:
                environment["BITCART_ADMIN_ROOTPATH"] = environment[
                    "BITCART_ADMIN_ROOTPATH"].replace("/", "/admin")
                environment["BITCART_ADMIN_API_URL"] = API_URL
    elif ADMIN_AVAILABLE:
        with modify_key(services, "admin", "environment") as environment:
            environment["BITCART_ADMIN_API_URL"] = API_URL
    if BACKEND_AVAILABLE and (ADMIN_AVAILABLE or STORE_AVAILABLE):
        with modify_key(services, "backend", "environment") as environment:
            environment["BITCART_BACKEND_ROOTPATH"] = environment[
                "BITCART_BACKEND_ROOTPATH"].replace("-}", "-/api}")
Example #16
def entrez(args):
    """
    %prog entrez <filename|term>

    `filename` contains a list of terms to search. Or just one term. If the
    results are small in size, e.g. "--format=acc", use "--batchsize=100" to speed
    the download.
    """
    p = OptionParser(entrez.__doc__)

    allowed_databases = {
        "fasta": ["genome", "nuccore", "nucgss", "protein", "nucest"],
        "asn.1": ["genome", "nuccore", "nucgss", "protein", "gene"],
        "xml": ["genome", "nuccore", "nucgss", "nucest", "gene"],
        "gb": ["genome", "nuccore", "nucgss"],
        "est": ["nucest"],
        "gss": ["nucgss"],
        "acc": ["nuccore"],
    }

    valid_formats = tuple(allowed_databases.keys())
    valid_databases = ("genome", "nuccore", "nucest", "nucgss", "protein", "gene")

    p.add_option(
        "--noversion",
        dest="noversion",
        default=False,
        action="store_true",
        help="Remove trailing accession versions",
    )
    p.add_option(
        "--format",
        default="fasta",
        choices=valid_formats,
        help="download format",
    )
    p.add_option(
        "--database",
        default="nuccore",
        choices=valid_databases,
        help="search database",
    )
    p.add_option(
        "--retmax",
        default=1000000,
        type="int",
        help="how many results to return",
    )
    p.add_option(
        "--skipcheck",
        default=False,
        action="store_true",
        help="turn off prompt to check file existence",
    )
    p.add_option(
        "--batchsize",
        default=500,
        type="int",
        help="download the results in batch for speed-up",
    )
    p.set_outdir(outdir=None)
    p.add_option("--outprefix", default="out", help="output file name prefix")
    p.set_email()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(p.print_help())

    (filename,) = args
    if op.exists(filename):
        pf = filename.rsplit(".", 1)[0]
        list_of_terms = [row.strip() for row in open(filename)]
        if opts.noversion:
            list_of_terms = [x.rsplit(".", 1)[0] for x in list_of_terms]
    else:
        pf = filename
        # the filename is the search term
        list_of_terms = [filename.strip()]

    fmt = opts.format
    database = opts.database
    batchsize = opts.batchsize

    assert (
        database in allowed_databases[fmt]
    ), "For output format '{0}', allowed databases are: {1}".format(
        fmt, allowed_databases[fmt]
    )
    assert batchsize >= 1, "batchsize must >= 1"

    if " " in pf:
        pf = opts.outprefix

    outfile = "{0}.{1}".format(pf, fmt)

    outdir = opts.outdir
    if outdir:
        mkdir(outdir)

    # If noprompt, will not check file existence
    if not outdir:
        fw = must_open(outfile, "w", checkexists=True, skipcheck=opts.skipcheck)
        if fw is None:
            return

    seen = set()
    totalsize = 0
    for id, size, term, handle in batch_entrez(
        list_of_terms,
        retmax=opts.retmax,
        rettype=fmt,
        db=database,
        batchsize=batchsize,
        email=opts.email,
    ):
        if outdir:
            outfile = urljoin(outdir, "{0}.{1}".format(term, fmt))
            fw = must_open(outfile, "w", checkexists=True, skipcheck=opts.skipcheck)
            if fw is None:
                continue

        rec = handle.read()
        if id in seen:
            logging.error("Duplicate key ({0}) found".format(rec))
            continue

        totalsize += size
        print(rec, file=fw)
        print(file=fw)

        seen.add(id)

    if seen:
        printf(
            "A total of {0} {1} records downloaded.".format(totalsize, fmt.upper()),
        )

    return outfile
Example #17
def search(args):
    """
    args contains a dict with one or key:values

    transcript is AGI identifier and is mandatory
    material is the tissue or treatment and is restricted below to a limited list
    """

    """
    In the future, ADAMA will check a query, map_*, or generic request against a list of mandatory
    parameters specified for each service. For now, if we want to enforce that behavior we need to
    implement it ourselves.

    ADAMA will have a graceful, cross-language exception handling scheme in a future release.
    At present, we are hand-coding a return.
    """
    if not args.viewkeys() & {'material1', 'material2', 'foldchange'}:
        return

    """
    Check that foldchange is a valid number
    """
    foldchange = args['foldchange']
    try:
        n = float(foldchange)
    except (ValueError, TypeError):
        return

    """
    Check materials to make sure they're in the (hard-coded) approved list
    """
    valid_materials = { 'flower': 'Flo', 'iaa': 'IAA', 'leaf': 'Lea', \
            'root': 'Roo', 'salicylic': 'Sal', 'nacl': 'NaC', \
            'young': 'You', 't87': 'T87'}

    material1 = args['material1'].lower()
    if material1 not in valid_materials.keys():
        return
    tissue1 = valid_materials[material1]

    material2 = args['material2'].lower()
    if material2 not in valid_materials.keys():
        return
    tissue2 = valid_materials[material2]

    """
    Build the url from the base + the intended endpoint action
    Also encode the params (payload) into a dict
    """
    url = urljoin(jcvi_common.base_url(), 'ExpressionConditionComparison')
    payload = { 'tissue1': tissue1, 'tissue2': tissue2, 'change': foldchange }

    """
    Make the request to the remote service
    """
    r = requests.get(url, params=payload)

    """
    Iterate through the results
    Foreach record from the remote service, build the response json
    Print this json to stdout followed by a record separator "---"
    ADAMA takes care of serializing these results
    """
    p = re.compile(r'AT[1-5MC]G[0-9]{5,5}\.[0-9]+', re.IGNORECASE)
    for result in r.json()['compare_table']:

        # check that transcript uses a valid transcript identifier
        transcript = result['elem_target_id']
        if not p.search(transcript):
            continue

        record = {
                'transcript': transcript,
                'class': 'transcript_property',
                'source_text_description': 'RT-PCR',
                'expression_comparison_record': {
                        'material1_text_description': result['elem_tissue1'],
                        'expression_value_material1': result['elem_tissue1_value'],
                        'expression_value_material1_stdev': result['elem_tissue1_value2'],
                        'material2_text_description': result['elem_tissue2'],
                        'expression_value_material2': result['elem_tissue2_value'],
                        'expression_value_material2_stdev': result['elem_tissue2_value2']
                }
            }
        print json.dumps(record, indent=2)
        print '---'
Example #18
 def __init__(self, request, endpoint):
     self.request = lambda x: request(urljoin(endpoint, "price-conversion"),
                                      args(**x))
Example #19
 def __init__(self, request, endpoint):
     self.request = lambda x: request(urljoin(endpoint, "map"), x)
Example #20
 def __init__(self, request, endpoint):
     self.request = lambda x, y: request(urljoin(endpoint, "quotes", x),
                                         args(**y))
Example #21
 def __init__(self, request, endpoint):
     self.request = lambda x: request(
         urljoin(endpoint, "market-pairs/latest"), args(**x))
Example #22
 def __init__(self, request, endpoint):
     self.request = lambda x: request(urljoin(endpoint, "info"), args(**x))
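Examples #18 through #22 all follow one pattern: each endpoint class captures a shared request callable plus a base endpoint and exposes a single bound lambda. A hypothetical wiring, where args is assumed to simply normalize keyword parameters into a dict:

def args(**params):
    # Illustrative stand-in for the args helper the wrappers above rely on:
    # drop unset keyword parameters and pass the rest through as a dict.
    return {k: v for k, v in params.items() if v is not None}

# Hypothetical usage, with `client.request` being Example #4's method and
# InfoEndpoint an assumed name for Example #22's class:
#   info = InfoEndpoint(client.request, "cryptocurrency")
#   info.request({"symbol": "BTC"})  # GET .../cryptocurrency/info?symbol=BTC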
Example #23
File: fetch.py Project: yangjl/jcvi
def entrez(args):
    """
    %prog entrez <filename|term>

    `filename` contains a list of terms to search. Or just one term. If the
    results are small in size, e.g. "--format=acc", use "--batchsize=100" to speed
    the download.
    """
    p = OptionParser(entrez.__doc__)

    allowed_databases = {"fasta": ["genome", "nuccore", "nucgss", "protein", "nucest"],
                         "asn.1": ["genome", "nuccore", "nucgss", "protein"],
                         "gb"   : ["genome", "nuccore", "nucgss"],
                         "est"  : ["nucest"],
                         "gss"  : ["nucgss"],
                         "acc"  : ["nuccore"],
                        }

    valid_formats = tuple(allowed_databases.keys())
    valid_databases = ("genome", "nuccore", "nucest", "nucgss", "protein")

    p.add_option("--noversion", dest="noversion",
            default=False, action="store_true",
            help="Remove trailing accession versions")
    p.add_option("--format", default="fasta", choices=valid_formats,
            help="download format [default: %default]")
    p.add_option("--database", default="nuccore", choices=valid_databases,
            help="search database [default: %default]")
    p.add_option("--retmax", default=1000000, type="int",
            help="how many results to return [default: %default]")
    p.add_option("--skipcheck", default=False, action="store_true",
            help="turn off prompt to check file existence [default: %default]")
    p.add_option("--batchsize", default=500, type="int",
            help="download the results in batch for speed-up [default: %default]")
    p.add_option("--outdir", default=None,
            help="output directory, with accession number as filename")
    p.add_option("--outprefix", default="out",
            help="output file name prefix [default: %default]")
    p.set_email()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(p.print_help())

    filename, = args
    if op.exists(filename):
        pf = filename.rsplit(".", 1)[0]
        list_of_terms = [row.strip() for row in open(filename)]
        if opts.noversion:
            list_of_terms = [x.rsplit(".", 1)[0] for x in list_of_terms]
    else:
        pf = filename
        # the filename is the search term
        list_of_terms = [filename.strip()]

    fmt = opts.format
    database = opts.database
    batchsize = opts.batchsize

    assert database in allowed_databases[fmt], \
        "For output format '{0}', allowed databases are: {1}".\
        format(fmt, allowed_databases[fmt])
    assert batchsize >= 1, "batchsize must >= 1"

    if " " in pf:
        pf = opts.outprefix

    outfile = "{0}.{1}".format(pf, fmt)

    outdir = opts.outdir
    if outdir:
        mkdir(outdir)

    # If noprompt, will not check file existence
    if not outdir:
        fw = must_open(outfile, "w", checkexists=True, \
                skipcheck=opts.skipcheck)
        if fw is None:
            return

    seen = set()
    totalsize = 0
    for id, size, term, handle in batch_entrez(list_of_terms, retmax=opts.retmax, \
                                 rettype=fmt, db=database, batchsize=batchsize, \
                                 email=opts.email):
        if outdir:
            outfile = urljoin(outdir, "{0}.{1}".format(term, fmt))
            fw = must_open(outfile, "w", checkexists=True, \
                    skipcheck=opts.skipcheck)
            if fw is None:
                continue

        rec = handle.read()
        if id in seen:
            logging.error("Duplicate key ({0}) found".format(rec))
            continue

        totalsize += size
        print >> fw, rec
        print >> fw

        seen.add(id)

    if seen:
        print >> sys.stderr, "A total of {0} {1} records downloaded.".\
                format(totalsize, fmt.upper())

    return outfile
Example #24
import posixpath

def urljoin(url, path):
    # Original read `return path.urljoin(url, path)`, which raises
    # AttributeError: the str argument `path` shadows the module it meant to
    # call. posixpath.join (an assumption about the intent) restores plain
    # segment joining.
    return posixpath.join(url, path)
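Worth keeping in mind when comparing these examples with the standard library: urllib.parse.urljoin resolves a relative reference against a base URL rather than concatenating segments, so trailing slashes change the result:

from urllib.parse import urljoin

urljoin("https://host/api", "info")    # 'https://host/info'    (last segment replaced)
urljoin("https://host/api/", "info")   # 'https://host/api/info'
urljoin("https://host/api/", "/info")  # 'https://host/info'    (absolute path resets)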